// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	NET3	Protocol independent device support routines.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>

#include "net-sysfs.h"

#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info);
static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack);
static struct napi_struct *napi_by_id(unsigned int napi_id);
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static DECLARE_RWSEM(devnet_rename_sem);
static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0)
		;
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
							const char *name)
{
	struct netdev_name_node *name_node;

	name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
	if (!name_node)
		return NULL;
	INIT_HLIST_NODE(&name_node->hlist);
	name_node->dev = dev;
	name_node->name = name;
	return name_node;
}

static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
	struct netdev_name_node *name_node;

	name_node = netdev_name_node_alloc(dev, dev->name);
	if (!name_node)
		return NULL;
	INIT_LIST_HEAD(&name_node->list);
	return name_node;
}
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
	kfree(name_node);
}

static void netdev_name_node_add(struct net *net,
				 struct netdev_name_node *name_node)
{
	hlist_add_head_rcu(&name_node->hlist,
			   dev_name_hash(net, name_node->name));
}

static void netdev_name_node_del(struct netdev_name_node *name_node)
{
	hlist_del_rcu(&name_node->hlist);
}
static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
							 const char *name)
{
	struct hlist_head *head = dev_name_hash(net, name);
	struct netdev_name_node *name_node;

	hlist_for_each_entry(name_node, head, hlist)
		if (!strcmp(name_node->name, name))
			return name_node;
	return NULL;
}

static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
							     const char *name)
{
	struct hlist_head *head = dev_name_hash(net, name);
	struct netdev_name_node *name_node;

	hlist_for_each_entry_rcu(name_node, head, hlist)
		if (!strcmp(name_node->name, name))
			return name_node;
	return NULL;
}
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	name_node = netdev_name_node_lookup(net, name);
	if (name_node)
		return -EEXIST;
	name_node = netdev_name_node_alloc(dev, name);
	if (!name_node)
		return -ENOMEM;
	netdev_name_node_add(net, name_node);
	/* The node that holds dev->name acts as a head of per-device list. */
	list_add_tail(&name_node->list, &dev->name_node->list);

	return 0;
}
EXPORT_SYMBOL(netdev_name_node_alt_create);
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
	list_del(&name_node->list);
	netdev_name_node_del(name_node);
	kfree(name_node->name);
	netdev_name_node_free(name_node);
}

int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
	struct netdev_name_node *name_node;
	struct net *net = dev_net(dev);

	name_node = netdev_name_node_lookup(net, name);
	if (!name_node)
		return -ENOENT;
	/* lookup might have found our primary name or a name belonging
	 * to another device.
	 */
	if (name_node == dev->name_node || name_node->dev != dev)
		return -EINVAL;

	__netdev_name_node_alt_destroy(name_node);

	return 0;
}
EXPORT_SYMBOL(netdev_name_node_alt_destroy);
static void netdev_name_node_alt_flush(struct net_device *dev)
{
	struct netdev_name_node *name_node, *tmp;

	list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
		__netdev_name_node_alt_destroy(name_node);
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	netdev_name_node_add(net, dev->name_node);
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	netdev_name_node_del(dev->name_node);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}
static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] = {
	ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] = {
	"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	"_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	"_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	"_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	"_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	"_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	"_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	"_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	"_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	"_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	"_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	"_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	"_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	"_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	"_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *		Protocol management and registration routines
 *
 *******************************************************************************/
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
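
/*
 * Illustrative sketch (not part of the original file): a protocol module
 * typically pairs dev_add_pack() with dev_remove_pack().  The handler and
 * packet_type names below are hypothetical.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// consume the packet; NET_RX_SUCCESS keeps the stack's
 *		// accounting happy
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_pt __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),	// tap every protocol
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_pt);	// in module init
 *	dev_remove_pack(&example_pt);	// in module exit
 */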
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************
 *
 *		      Device Boot-time Settings Routines
 *
 ******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevice classes.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 * netdev_boot_setup_check	- check boot time settings
 * @dev: the netdevice
 *
 * Check boot time settings for the device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 * netdev_boot_base	- get address from boot time settings
 * @prefix: prefix for network device
 * @unit: id for network device
 *
 * Check boot time settings for the base address of device.
 * The found settings are set for the device to be used
 * later in the device probing.
 * Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
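
/*
 * Illustrative sketch (not part of the original file): with the parsing
 * above, a kernel command line entry of the form
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * e.g. "netdev=9,0x300,0,0,eth0", stores an ifmap for "eth0" that a legacy
 * ISA-style driver can later pick up via netdev_boot_setup_check().  The
 * concrete values here are hypothetical.
 */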
/*******************************************************************************
 *
 *			    Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of a interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);
/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. Following API allows
 *	user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct netdev_name_node *node_name;

	node_name = netdev_name_node_lookup(net, name);
	return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct netdev_name_node *node_name;

	node_name = netdev_name_node_lookup_rcu(net, name);
	return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
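
/*
 * Illustrative sketch (not part of the original file): dev_get_by_name()
 * takes a reference that the caller must drop with dev_put(); the name
 * "eth0" here is hypothetical.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		// ... use dev while holding the reference ...
 *		dev_put(dev);
 *	}
 */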
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);
/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *	dev_get_by_napi_id - find a device by napi_id
 *	@napi_id: ID of the NAPI struct
 *
 *	Search for an interface by NAPI ID. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not had
 *	its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
	struct napi_struct *napi;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (napi_id < MIN_NAPI_ID)
		return NULL;

	napi = napi_by_id(napi_id);

	return napi ? napi->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_napi_id);
/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	int ret;

	down_read(&devnet_rename_sem);
	rcu_read_lock();

	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		ret = -ENODEV;
		goto out;
	}

	strcpy(name, dev->name);

	ret = 0;
out:
	rcu_read_unlock();
	up_read(&devnet_rename_sem);
	return ret;
}
/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
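
/*
 * Illustrative sketch (not part of the original file): the checks above
 * accept ordinary names and reject path-like or whitespace-containing ones.
 *
 *	dev_valid_name("eth0")    -> true
 *	dev_valid_name("my/dev")  -> false  (contains '/')
 *	dev_valid_name("..")      -> false
 *	dev_valid_name("a name")  -> false  (whitespace)
 */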
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	if (!dev_valid_name(name))
		return -EINVAL;

	p = strchr(name, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	BUG_ON(!net);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	return dev_alloc_name_ns(dev_net(dev), dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
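
/*
 * Illustrative sketch (not part of the original file): a driver that
 * registers devices with a "%d" pattern lets the core pick the unit number.
 * The "foo%d" pattern is hypothetical.
 *
 *	err = dev_alloc_name(dev, "foo%d");
 *	if (err < 0)
 *		goto fail;	// no free slot or invalid pattern
 *	// on success dev->name is now e.g. "foo0", and err holds the unit
 */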
static int dev_get_valid_name(struct net *net, struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);

	/* Some auto-enslaved devices e.g. failover slaves are
	 * special, as userspace might rename the device after
	 * the interface had been brought up and running since
	 * the point kernel initiated auto-enslavement. Allow
	 * live name change even when these slave devices are
	 * up and running.
	 *
	 * Typically, users of these auto-enslaving devices
	 * don't actually care about slave name change, as
	 * they are supposed to operate on master interface
	 * directly.
	 */
	if (dev->flags & IFF_UP &&
	    likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
		return -EBUSY;

	down_write(&devnet_rename_sem);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		up_write(&devnet_rename_sem);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		up_write(&devnet_rename_sem);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		up_write(&devnet_rename_sem);
		return ret;
	}

	up_write(&devnet_rename_sem);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	netdev_name_node_del(dev->name_node);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	netdev_name_node_add(net, dev->name_node);
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			down_write(&devnet_rename_sem);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	struct dev_ifalias *new_alias = NULL;

	if (len >= IFALIASZ)
		return -EINVAL;

	if (len) {
		new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
		if (!new_alias)
			return -ENOMEM;

		memcpy(new_alias->ifalias, alias, len);
		new_alias->ifalias[len] = 0;
	}

	mutex_lock(&ifalias_mutex);
	new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
					mutex_is_locked(&ifalias_mutex));
	mutex_unlock(&ifalias_mutex);

	if (new_alias)
		kfree_rcu(new_alias, rcuhead);

	return len;
}
EXPORT_SYMBOL(dev_set_alias);
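
/*
 * Illustrative sketch (not part of the original file): setting and reading
 * back an interface alias; the alias string is hypothetical.
 *
 *	char buf[IFALIASZ];
 *
 *	dev_set_alias(dev, "uplink to core switch", 21);
 *	dev_get_alias(dev, buf, sizeof(buf));	// copies the alias into buf
 *	dev_set_alias(dev, NULL, 0);		// clears the alias again
 */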
/**
 *	dev_get_alias - get ifalias of a device
 *	@dev: device
 *	@name: buffer to store name of ifalias
 *	@len: size of buffer
 *
 *	get ifalias for a device.  Caller must make sure dev cannot go
 *	away,  e.g. rcu read lock or own a reference count to device.
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
	const struct dev_ifalias *alias;
	int ret = 0;

	rcu_read_lock();
	alias = rcu_dereference(dev->ifalias);
	if (alias)
		ret = snprintf(name, len, "%s", alias->ifalias);
	rcu_read_unlock();

	return ret;
}
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info = {
			.info.dev = dev,
		};

		call_netdevice_notifiers_info(NETDEV_CHANGE,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);
/**
 * __netdev_notify_peers - notify network peers about existence of @dev,
 * to be called when rtnl lock is already held.
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void __netdev_notify_peers(struct net_device *dev)
{
	ASSERT_RTNL();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	__netdev_notify_peers(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev)) {
		/* may be detached because parent is runtime-suspended */
		if (dev->dev.parent)
			pm_runtime_resume(dev->dev.parent);
		if (!netif_device_present(dev))
			return -ENODEV;
	}

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}
/**
 *	dev_open	- prepare an interface for use.
 *	@dev: device to open
 *	@extack: netlink extended ack
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev, extack);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
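
/*
 * Illustrative sketch (not part of the original file): bringing an interface
 * up from kernel code mirrors what "ip link set ... up" does, and must run
 * under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_open(dev, NULL);	// NULL: no extended ack requested
 *	rtnl_unlock();
 */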
static void __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}
}

static void __dev_close(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	__dev_close_many(&single);
	list_del(&single);
}
void dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
void dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
}
EXPORT_SYMBOL(dev_close);
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

/**
 *	dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 *	@dev: device
 *
 *	Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 *	called under RTNL.  This is needed if Generic XDP is installed on
 *	the device.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
	dev->wanted_features &= ~NETIF_F_GRO_HW;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_GRO_HW))
		netdev_WARN(dev, "failed to disable GRO_HW!\n");
}
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
#define N(val) 						\
	case NETDEV_##val:				\
		return "NETDEV_" __stringify(val);
	switch (cmd) {
	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
	}
#undef N
	return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info = {
		.dev = dev,
	};

	return nb->notifier_call(nb, val, &info);
}

static int call_netdevice_register_notifiers(struct notifier_block *nb,
					     struct net_device *dev)
{
	int err;

	err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	if (!(dev->flags & IFF_UP))
		return 0;

	call_netdevice_notifier(nb, NETDEV_UP, dev);
	return 0;
}

static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
						struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev);
		call_netdevice_notifier(nb, NETDEV_DOWN, dev);
	}
	call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}
static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
						 struct net *net)
{
	struct net_device *dev;
	int err;

	for_each_netdev(net, dev) {
		err = call_netdevice_register_notifiers(nb, dev);
		if (err)
			goto rollback;
	}
	return 0;

rollback:
	for_each_netdev_continue_reverse(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
	return err;
}

static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
						     struct net *net)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		call_netdevice_unregister_notifiers(nb, dev);
}

static int dev_boot_phase = 1;
/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		err = call_netdevice_register_net_notifiers(nb, net);
		if (err)
			goto rollback;
	}

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;

rollback:
	for_each_net_continue_reverse(net)
		call_netdevice_unregister_net_notifiers(nb, net);

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
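
/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * that logs interface up/down events.  The function and block names are
 * hypothetical.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP || event == NETDEV_DOWN)
 *			pr_info("%s: %s\n", dev->name, netdev_cmd_to_name(event));
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);	// existing devices are replayed
 *	unregister_netdevice_notifier(&example_nb);
 */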
/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked into the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net *net;
	int err;

	/* Close race with setup_net() and cleanup_net() */
	down_write(&pernet_ops_rwsem);
	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net)
		call_netdevice_unregister_net_notifiers(nb, net);

unlock:
	rtnl_unlock();
	up_write(&pernet_ops_rwsem);
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
static int __register_netdevice_notifier_net(struct net *net,
					     struct notifier_block *nb,
					     bool ignore_call_fail)
{
	int err;

	err = raw_notifier_chain_register(&net->netdev_chain, nb);
	if (err)
		return err;
	if (dev_boot_phase)
		return 0;

	err = call_netdevice_register_net_notifiers(nb, net);
	if (err && !ignore_call_fail)
		goto chain_unregister;

	return 0;

chain_unregister:
	raw_notifier_chain_unregister(&net->netdev_chain, nb);
	return err;
}

static int __unregister_netdevice_notifier_net(struct net *net,
					       struct notifier_block *nb)
{
	int err;

	err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
	if (err)
		return err;

	call_netdevice_unregister_net_notifiers(nb, net);
	return 0;
}
/**
 * register_netdevice_notifier_net - register a per-netns network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __register_netdevice_notifier_net(net, nb, false);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);

/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked into the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier_net(struct net *net,
				      struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = __unregister_netdevice_notifier_net(net, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
int register_netdevice_notifier_dev_net(struct net_device *dev,
					struct notifier_block *nb,
					struct netdev_net_notifier *nn)
{
	int err;

	rtnl_lock();
	err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
	if (!err) {
		nn->nb = nb;
		list_add(&nn->list, &dev->net_notifier_list);
	}
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);

int unregister_netdevice_notifier_dev_net(struct net_device *dev,
					  struct notifier_block *nb,
					  struct netdev_net_notifier *nn)
{
	int err;

	rtnl_lock();
	list_del(&nn->list);
	err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);

static void move_netdevice_notifiers_dev_net(struct net_device *dev,
					     struct net *net)
{
	struct netdev_net_notifier *nn;

	list_for_each_entry(nn, &dev->net_notifier_list, list) {
		__unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
		__register_netdevice_notifier_net(net, nn->nb, true);
	}
}
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct netdev_notifier_info *info)
{
	struct net *net = dev_net(info->dev);
	int ret;

	ASSERT_RTNL();

	/* Run per-netns notifier block chain first, then run the global one.
	 * Hopefully, one day, the global one is going to be removed after
	 * all notifier block registrators get converted to be per-netns.
	 */
	ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
	if (ret & NOTIFY_STOP_MASK)
		return ret;
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

static int call_netdevice_notifiers_extack(unsigned long val,
					   struct net_device *dev,
					   struct netlink_ext_ack *extack)
{
	struct netdev_notifier_info info = {
		.dev = dev,
		.extack = extack,
	};

	return call_netdevice_notifiers_info(val, &info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);
/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, &info.info);
}

#ifdef CONFIG_NET_INGRESS
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

void net_inc_ingress_queue(void)
{
	static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

void net_inc_egress_queue(void)
{
	static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_branch_enable(&netstamp_needed_key);
	else
		static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_branch_unlikely(&netstamp_needed_key))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)				\
	if (static_branch_unlikely(&netstamp_needed_key)) {	\
		if ((COND) && !(SKB)->tstamp)			\
			__net_timestamp(SKB);			\
	}
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
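
/* Illustrative sketch (not part of the original file): how a paired virtual
 * driver might hand a frame from its own ndo_start_xmit to a peer device via
 * dev_forward_skb().  The "peer" argument and the function name are
 * hypothetical; real drivers such as veth keep the peer in their private data
 * and do more bookkeeping.
 */
static netdev_tx_t __maybe_unused example_loop_xmit(struct sk_buff *skb,
						    struct net_device *peer)
{
	/* dev_forward_skb() scrubs namespace-sensitive state and queues the
	 * skb on the peer's receive path; it consumes the skb either way.
	 */
	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		peer->stats.rx_dropped++;

	return NETDEV_TX_OK;
}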
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;
	refcount_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}
/**
 * dev_nit_active - return true if any network interface taps are in use
 *
 * @dev: network device to check for the presence of taps
 */
bool dev_nit_active(struct net_device *dev)
{
	return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active);
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->ignore_outgoing)
			continue;

		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev) {
		if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
			pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
		else
			kfree_skb(skb2);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
	if (dev->num_tc) {
		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
		int i;

		/* walk through the TCs and see if it falls into any of them */
		for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
			if ((txq - tc->offset) < tc->count)
				return i;
		}

		/* didn't find it, just return -1 to indicate no match */
		return -1;
	}

	return 0;
}
EXPORT_SYMBOL(netdev_txq_to_tc);
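
/* Illustrative sketch (not part of the original file): how a driver (or a
 * qdisc such as mqprio acting on its behalf) typically builds the mapping
 * that netif_setup_tc() and netdev_txq_to_tc() operate on.  The queue layout
 * (2 TCs with 4 queues each) and the function name are made up for the
 * example.
 */
static int __maybe_unused example_setup_two_tcs(struct net_device *dev)
{
	int err, prio;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	/* TC0 -> queues 0..3, TC1 -> queues 4..7 */
	netdev_set_tc_queue(dev, 0, 4, 0);
	netdev_set_tc_queue(dev, 1, 4, 4);

	/* Map the sixteen skb priorities: odd priorities to TC1, even to TC0 */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio & 1);

	/* netdev_txq_to_tc(dev, 5) would now report traffic class 1 */
	return 0;
}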
#ifdef CONFIG_XPS
struct static_key xps_needed __read_mostly;
EXPORT_SYMBOL(xps_needed);
struct static_key xps_rxqs_needed __read_mostly;
EXPORT_SYMBOL(xps_rxqs_needed);
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
			     int tci, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->attr_map[tci]);
	if (!map)
		return false;

	for (pos = map->len; pos--;) {
		if (map->queues[pos] != index)
			continue;

		if (map->len > 1) {
			map->queues[pos] = map->queues[--map->len];
			break;
		}

		RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
		kfree_rcu(map, rcu);
		return false;
	}

	return true;
}
static bool remove_xps_queue_cpu(struct net_device *dev,
				 struct xps_dev_maps *dev_maps,
				 int cpu, u16 offset, u16 count)
{
	int num_tc = dev->num_tc ? : 1;
	bool active = false;
	int tci;

	for (tci = cpu * num_tc; num_tc--; tci++) {
		int i, j;

		for (i = count, j = offset; i--; j++) {
			if (!remove_xps_queue(dev_maps, tci, j))
				break;
		}

		active |= i < 0;
	}

	return active;
}
static void reset_xps_maps(struct net_device *dev,
			   struct xps_dev_maps *dev_maps,
			   bool is_rxqs_map)
{
	if (is_rxqs_map) {
		static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
		RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
	} else {
		RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
	}
	static_key_slow_dec_cpuslocked(&xps_needed);
	kfree_rcu(dev_maps, rcu);
}
static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
			   struct xps_dev_maps *dev_maps, unsigned int nr_ids,
			   u16 offset, u16 count, bool is_rxqs_map)
{
	bool active = false;
	int i, j;

	for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
	     j < nr_ids;)
		active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
					       count);
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);

	if (!is_rxqs_map) {
		for (i = offset + (count - 1); count--; i--) {
			netdev_queue_numa_node_write(
				netdev_get_tx_queue(dev, i),
				NUMA_NO_NODE);
		}
	}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
				   u16 count)
{
	const unsigned long *possible_mask = NULL;
	struct xps_dev_maps *dev_maps;
	unsigned int nr_ids;

	if (!static_key_false(&xps_needed))
		return;

	cpus_read_lock();
	mutex_lock(&xps_map_mutex);

	if (static_key_false(&xps_rxqs_needed)) {
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		if (dev_maps) {
			nr_ids = dev->num_rx_queues;
			clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
				       offset, count, true);
		}
	}

	dev_maps = xmap_dereference(dev->xps_cpus_map);
	if (!dev_maps)
		goto out_no_maps;

	if (num_possible_cpus() > 1)
		possible_mask = cpumask_bits(cpu_possible_mask);
	nr_ids = nr_cpu_ids;
	clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
		       false);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
	cpus_read_unlock();
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
				      u16 index, bool is_rxqs_map)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add tx-queue to this CPU's/rx-queue's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
	 *  map
	 */
	if (is_rxqs_map)
		new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
	else
		new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
				       cpu_to_node(attr_index));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
			  u16 index, bool is_rxqs_map)
{
	const unsigned long *online_mask = NULL, *possible_mask = NULL;
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	int i, j, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;
	unsigned int nr_ids;

	if (dev->num_tc) {
		/* Do not allow XPS on subordinate device directly */
		num_tc = dev->num_tc;
		if (num_tc < 0)
			return -EINVAL;

		/* If queue belongs to subordinate dev use its map */
		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	mutex_lock(&xps_map_mutex);
	if (is_rxqs_map) {
		maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
		dev_maps = xmap_dereference(dev->xps_rxqs_map);
		nr_ids = dev->num_rx_queues;
	} else {
		maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
		if (num_possible_cpus() > 1) {
			online_mask = cpumask_bits(cpu_online_mask);
			possible_mask = cpumask_bits(cpu_possible_mask);
		}
		dev_maps = xmap_dereference(dev->xps_cpus_map);
		nr_ids = nr_cpu_ids;
	}

	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	/* allocate memory for queue storage */
	for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
	     j < nr_ids;) {
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		tci = j * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
				 NULL;

		map = expand_xps_map(map, j, index, is_rxqs_map);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	if (!dev_maps) {
		/* Increment static keys at most once per type */
		static_key_slow_inc_cpuslocked(&xps_needed);
		if (is_rxqs_map)
			static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
	}

	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}

		/* We need to explicitly update tci as prevous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = j * num_tc + tc;

		if (netif_attr_test_mask(j, mask, nr_ids) &&
		    netif_attr_test_online(j, online_mask, nr_ids)) {
			/* add tx-queue to CPU/rx-queue maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->attr_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (!is_rxqs_map) {
				if (numa_node_id == -2)
					numa_node_id = cpu_to_node(j);
				else if (numa_node_id != cpu_to_node(j))
					numa_node_id = -1;
			}
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->attr_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
		}
	}

	if (is_rxqs_map)
		rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
	else
		rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = xmap_dereference(dev_maps->attr_map[tci]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	if (!is_rxqs_map) {
		/* update Tx queue numa node */
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
					     (numa_node_id >= 0) ?
					     numa_node_id : NUMA_NO_NODE);
	}

	if (!dev_maps)
		goto out_no_maps;

	/* removes tx-queue from unused CPUs/rx-queues */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = tc, tci = j * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		if (!netif_attr_test_mask(j, mask, nr_ids) ||
		    !netif_attr_test_online(j, online_mask, nr_ids))
			active |= remove_xps_queue(dev_maps, tci, index);
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active)
		reset_xps_maps(dev, dev_maps, is_rxqs_map);

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
	     j < nr_ids;) {
		for (i = num_tc, tci = j * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->attr_map[tci]) :
			      NULL;
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	int ret;

	cpus_read_lock();
	ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
	cpus_read_unlock();

	return ret;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif	/* CONFIG_XPS */
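
/* Illustrative sketch (not part of the original file): a driver pinning each
 * of its TX queues to one CPU through the netif_set_xps_queue() wrapper
 * above.  Real drivers usually do this from their open/setup path; the
 * function name is hypothetical and errors are ignored for brevity.
 */
static void __maybe_unused example_default_xps(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		/* cpumask_of() yields a single-CPU mask */
		netif_set_xps_queue(dev, cpumask_of(i % nr_cpu_ids), i);
	}
}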
static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];

	/* Unbind any subordinate channels */
	while (txq-- != &dev->_tx[0]) {
		if (txq->sb_dev)
			netdev_unbind_sb_channel(dev, txq->sb_dev);
	}
}
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	/* Reset TC configuration of device */
	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
	if (tc >= dev->num_tc)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues(dev, offset, count);
#endif
	dev->tc_to_txq[tc].count = count;
	dev->tc_to_txq[tc].offset = offset;
	return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
	if (num_tc > TC_MAX_QUEUE)
		return -EINVAL;

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(dev, 0);
#endif
	netdev_unbind_all_sb_channels(dev);

	dev->num_tc = num_tc;
	return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);
void netdev_unbind_sb_channel(struct net_device *dev,
			      struct net_device *sb_dev)
{
	struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];

#ifdef CONFIG_XPS
	netif_reset_xps_queues_gt(sb_dev, 0);
#endif
	memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
	memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));

	while (txq-- != &dev->_tx[0]) {
		if (txq->sb_dev == sb_dev)
			txq->sb_dev = NULL;
	}
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);
int netdev_bind_sb_channel_queue(struct net_device *dev,
				 struct net_device *sb_dev,
				 u8 tc, u16 count, u16 offset)
{
	/* Make certain the sb_dev and dev are already configured */
	if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
		return -EINVAL;

	/* We cannot hand out queues we don't have */
	if ((offset + count) > dev->real_num_tx_queues)
		return -EINVAL;

	/* Record the mapping */
	sb_dev->tc_to_txq[tc].count = count;
	sb_dev->tc_to_txq[tc].offset = offset;

	/* Provide a way for Tx queue to find the tc_to_txq map or
	 * XPS map for itself.
	 */
	while (count--)
		netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;

	return 0;
}
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
	/* Do not use a multiqueue device to represent a subordinate channel */
	if (netif_is_multiqueue(dev))
		return -ENODEV;

	/* We allow channels 1 - 32767 to be used for subordinate channels.
	 * Channel 0 is meant to be "native" mode and used only to represent
	 * the main root device. We allow writing 0 to reset the device back
	 * to normal mode after being used as a subordinate channel.
	 */
	if (channel > S16_MAX)
		return -EINVAL;

	dev->num_tc = -channel;

	return 0;
}
EXPORT_SYMBOL(netdev_set_sb_channel);
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	bool disabling;
	int rc;

	disabling = txq < dev->real_num_tx_queues;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		dev->real_num_tx_queues = txq;

		if (disabling) {
			synchronize_net();
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	} else {
		dev->real_num_tx_queues = txq;
	}

	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return is_kdump_kernel() ?
		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
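
/* Illustrative sketch (not part of the original file): how a multiqueue
 * driver commonly combines netif_get_num_default_rss_queues() with its own
 * hardware limit and then publishes the result.  "EXAMPLE_HW_MAX_QUEUES"
 * and the function name are made up, and most error handling is trimmed.
 */
#define EXAMPLE_HW_MAX_QUEUES	16

static int __maybe_unused example_publish_queue_counts(struct net_device *dev)
{
	int nq = min(netif_get_num_default_rss_queues(),
		     EXAMPLE_HW_MAX_QUEUES);
	int err;

	err = netif_set_real_num_tx_queues(dev, nq);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, nq);
}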
static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}
void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!netif_xmit_stopped(txq)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (unlikely(!skb))
		return;

	if (likely(refcount_read(&skb->users) == 1)) {
		smp_rmb();
		refcount_set(&skb->users, 0);
	} else if (likely(!refcount_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
	if (in_irq() || irqs_disabled())
		__dev_kfree_skb_irq(skb, reason);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);
/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
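
/* Illustrative sketch (not part of the original file): the usual pairing of
 * netif_device_detach()/netif_device_attach() in a driver's suspend and
 * resume callbacks.  The hardware (re)configuration steps and the function
 * names are placeholders.
 */
static int __maybe_unused example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stop queues, mark device absent */
	/* ... put the hardware to sleep ... */
	return 0;
}

static int __maybe_unused example_resume(struct net_device *dev)
{
	/* ... reprogram the hardware ... */
	netif_device_attach(dev);	/* mark present, wake queues, rearm watchdog */
	return 0;
}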
/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues to be used as a distribution range.
 */
static u16 skb_tx_hash(const struct net_device *dev,
		       const struct net_device *sb_dev,
		       struct sk_buff *skb)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = dev->real_num_tx_queues;

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

		qoffset = sb_dev->tc_to_txq[tc].offset;
		qcount = sb_dev->tc_to_txq[tc].count;
	}

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		if (hash >= qoffset)
			hash -= qoffset;
		while (unlikely(hash >= qcount))
			hash -= qcount;
		return hash + qoffset;
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	skb_dump(KERN_WARNING, skb, false);
	WARN(1, "%s: caps=(%pNF, %pNF)\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_is_gso(skb))) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
	if (ret)
		goto out;

	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
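
/* Illustrative sketch (not part of the original file): the common pattern of
 * falling back to skb_checksum_help() in a driver's xmit path when the
 * hardware cannot offload the checksum of a particular packet.  The
 * "hw_can_checksum" predicate and both function names are hypothetical.
 */
static bool __maybe_unused example_hw_can_checksum(const struct sk_buff *skb)
{
	/* e.g. a limited offload engine that only handles IPv4 payloads */
	return skb->protocol == htons(ETH_P_IP);
}

static int __maybe_unused example_tx_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !example_hw_can_checksum(skb))
		return skb_checksum_help(skb);	/* compute it in software */

	return 0;
}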
int skb_crc32c_csum_help(struct sk_buff *skb)
{
	__le32 crc32c_csum;
	int ret = 0, offset, start;

	if (skb->ip_summed != CHECKSUM_PARTIAL)
		goto out;

	if (unlikely(skb_is_gso(skb)))
		goto out;

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (unlikely(skb_has_shared_frag(skb))) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}
	start = skb_checksum_start_offset(skb);
	offset = start + offsetof(struct sctphdr, checksum);
	if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
		ret = -EINVAL;
		goto out;
	}

	ret = skb_ensure_writable(skb, offset + sizeof(__le32));
	if (ret)
		goto out;

	crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
						  skb->len - start, ~(__u32)0,
						  crc32c_csum_stub));
	*(__le32 *)(skb->data + offset) = crc32c_csum;
	skb->ip_summed = CHECKSUM_NONE;
	skb->csum_not_inet = 0;
out:
	return ret;
}
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb->data;
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}
/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);
/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL &&
		       skb->ip_summed != CHECKSUM_UNNECESSARY;

	return skb->ip_summed == CHECKSUM_NONE;
}
/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	/* Only report GSO partial support if it will enable us to
	 * support segmentation on this frame without needing additional
	 * work.
	 */
	if (features & NETIF_F_GSO_PARTIAL) {
		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
		struct net_device *dev = skb->dev;

		partial_features |= dev->features & dev->gso_partial_features;
		if (!skb_gso_ok(skb, features | partial_features))
			features &= ~NETIF_F_GSO_PARTIAL;
	}

	BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
		skb_warn_bad_offload(skb);

	return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);
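
/* Illustrative sketch (not part of the original file): software segmentation
 * with skb_gso_segment() (a wrapper around __skb_gso_segment() above) and a
 * walk over the resulting list.  "example_xmit_one" stands in for whatever
 * consumes the individual segments; the whole function is hypothetical.
 */
static void __maybe_unused example_software_gso(struct sk_buff *skb,
						netdev_features_t features,
						void (*example_xmit_one)(struct sk_buff *))
{
	struct sk_buff *segs, *seg, *next;

	/* Masking out the GSO features forces an actual split. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		example_xmit_one(skb);	/* nothing to split, pass through */
		return;
	}

	consume_skb(skb);		/* the original has been replaced by segs */
	for (seg = segs; seg; seg = next) {
		next = seg->next;
		skb_mark_not_on_list(seg);
		example_xmit_one(seg);
	}
}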
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		skb_dump(KERN_ERR, skb, true);
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* XXX: check that highmem exists at all on the given machine. */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}
#endif
	return 0;
}
/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif
static netdev_features_t harmonize_features(struct sk_buff *skb,
					    netdev_features_t features)
{
	__be16 type;

	type = skb_network_protocol(skb, NULL);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
	}
	if (illegal_highdma(skb->dev, skb))
		features &= ~NETIF_F_SG;

	return features;
}
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
static netdev_features_t gso_features_check(const struct sk_buff *skb,
					    struct net_device *dev,
					    netdev_features_t features)
{
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	if (gso_segs > dev->gso_max_segs)
		return features & ~NETIF_F_GSO_MASK;

	if (!skb_shinfo(skb)->gso_type) {
		skb_warn_bad_offload(skb);
		return features & ~NETIF_F_GSO_MASK;
	}

	/* Support for GSO partial features requires software
	 * intervention before we can actually process the packets
	 * so we need to strip support for any partial features now
	 * and we can pull them back in after we have partially
	 * segmented the frame.
	 */
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
		features &= ~dev->gso_partial_features;

	/* Make sure to clear the IPv4 ID mangling feature if the
	 * IPv4 header has the potential to be fragmented.
	 */
	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
		struct iphdr *iph = skb->encapsulation ?
				    inner_ip_hdr(skb) : ip_hdr(skb);

		if (!(iph->frag_off & htons(IP_DF)))
			features &= ~NETIF_F_TSO_MANGLEID;
	}

	return features;
}
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;

	if (skb_is_gso(skb))
		features = gso_features_check(skb, dev, features);

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);
3562 static int xmit_one(struct sk_buff
*skb
, struct net_device
*dev
,
3563 struct netdev_queue
*txq
, bool more
)
3568 if (dev_nit_active(dev
))
3569 dev_queue_xmit_nit(skb
, dev
);
3572 PRANDOM_ADD_NOISE(skb
, dev
, txq
, len
+ jiffies
);
3573 trace_net_dev_start_xmit(skb
, dev
);
3574 rc
= netdev_start_xmit(skb
, dev
, txq
, more
);
3575 trace_net_dev_xmit(skb
, rc
, dev
, len
);
3580 struct sk_buff
*dev_hard_start_xmit(struct sk_buff
*first
, struct net_device
*dev
,
3581 struct netdev_queue
*txq
, int *ret
)
3583 struct sk_buff
*skb
= first
;
3584 int rc
= NETDEV_TX_OK
;
3587 struct sk_buff
*next
= skb
->next
;
3589 skb_mark_not_on_list(skb
);
3590 rc
= xmit_one(skb
, dev
, txq
, next
!= NULL
);
3591 if (unlikely(!dev_xmit_complete(rc
))) {
3597 if (netif_tx_queue_stopped(txq
) && skb
) {
3598 rc
= NETDEV_TX_BUSY
;
3608 static struct sk_buff
*validate_xmit_vlan(struct sk_buff
*skb
,
3609 netdev_features_t features
)
3611 if (skb_vlan_tag_present(skb
) &&
3612 !vlan_hw_offload_capable(features
, skb
->vlan_proto
))
3613 skb
= __vlan_hwaccel_push_inside(skb
);
3617 int skb_csum_hwoffload_help(struct sk_buff
*skb
,
3618 const netdev_features_t features
)
3620 if (unlikely(skb
->csum_not_inet
))
3621 return !!(features
& NETIF_F_SCTP_CRC
) ? 0 :
3622 skb_crc32c_csum_help(skb
);
3624 return !!(features
& NETIF_F_CSUM_MASK
) ? 0 : skb_checksum_help(skb
);
3626 EXPORT_SYMBOL(skb_csum_hwoffload_help
);
3628 static struct sk_buff
*validate_xmit_skb(struct sk_buff
*skb
, struct net_device
*dev
, bool *again
)
3630 netdev_features_t features
;
3632 features
= netif_skb_features(skb
);
3633 skb
= validate_xmit_vlan(skb
, features
);
3637 skb
= sk_validate_xmit_skb(skb
, dev
);
3641 if (netif_needs_gso(skb
, features
)) {
3642 struct sk_buff
*segs
;
3644 segs
= skb_gso_segment(skb
, features
);
3652 if (skb_needs_linearize(skb
, features
) &&
3653 __skb_linearize(skb
))
3656 /* If packet is not checksummed and device does not
3657 * support checksumming for this protocol, complete
3658 * checksumming here.
3660 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
3661 if (skb
->encapsulation
)
3662 skb_set_inner_transport_header(skb
,
3663 skb_checksum_start_offset(skb
));
3665 skb_set_transport_header(skb
,
3666 skb_checksum_start_offset(skb
));
3667 if (skb_csum_hwoffload_help(skb
, features
))
3672 skb
= validate_xmit_xfrm(skb
, features
, again
);
3679 atomic_long_inc(&dev
->tx_dropped
);
3683 struct sk_buff
*validate_xmit_skb_list(struct sk_buff
*skb
, struct net_device
*dev
, bool *again
)
3685 struct sk_buff
*next
, *head
= NULL
, *tail
;
3687 for (; skb
!= NULL
; skb
= next
) {
3689 skb_mark_not_on_list(skb
);
3691 /* in case skb wont be segmented, point to itself */
3694 skb
= validate_xmit_skb(skb
, dev
, again
);
3702 /* If skb was segmented, skb->prev points to
3703 * the last segment. If not, it still contains skb.
3709 EXPORT_SYMBOL_GPL(validate_xmit_skb_list
);
3711 static void qdisc_pkt_len_init(struct sk_buff
*skb
)
3713 const struct skb_shared_info
*shinfo
= skb_shinfo(skb
);
3715 qdisc_skb_cb(skb
)->pkt_len
= skb
->len
;
3717 /* To get more precise estimation of bytes sent on wire,
3718 * we add to pkt_len the headers size of all segments
3720 if (shinfo
->gso_size
&& skb_transport_header_was_set(skb
)) {
3721 unsigned int hdr_len
;
3722 u16 gso_segs
= shinfo
->gso_segs
;
3724 /* mac layer + network layer */
3725 hdr_len
= skb_transport_header(skb
) - skb_mac_header(skb
);
3727 /* + transport layer */
3728 if (likely(shinfo
->gso_type
& (SKB_GSO_TCPV4
| SKB_GSO_TCPV6
))) {
3729 const struct tcphdr
*th
;
3730 struct tcphdr _tcphdr
;
3732 th
= skb_header_pointer(skb
, skb_transport_offset(skb
),
3733 sizeof(_tcphdr
), &_tcphdr
);
3735 hdr_len
+= __tcp_hdrlen(th
);
3737 struct udphdr _udphdr
;
3739 if (skb_header_pointer(skb
, skb_transport_offset(skb
),
3740 sizeof(_udphdr
), &_udphdr
))
3741 hdr_len
+= sizeof(struct udphdr
);
3744 if (shinfo
->gso_type
& SKB_GSO_DODGY
)
3745 gso_segs
= DIV_ROUND_UP(skb
->len
- hdr_len
,
3748 qdisc_skb_cb(skb
)->pkt_len
+= (gso_segs
- 1) * hdr_len
;
3752 static inline int __dev_xmit_skb(struct sk_buff
*skb
, struct Qdisc
*q
,
3753 struct net_device
*dev
,
3754 struct netdev_queue
*txq
)
3756 spinlock_t
*root_lock
= qdisc_lock(q
);
3757 struct sk_buff
*to_free
= NULL
;
3761 qdisc_calculate_pkt_len(skb
, q
);
3763 if (q
->flags
& TCQ_F_NOLOCK
) {
3764 rc
= q
->enqueue(skb
, q
, &to_free
) & NET_XMIT_MASK
;
3767 if (unlikely(to_free
))
3768 kfree_skb_list(to_free
);
3773 * Heuristic to force contended enqueues to serialize on a
3774 * separate lock before trying to get qdisc main lock.
3775 * This permits qdisc->running owner to get the lock more
3776 * often and dequeue packets faster.
3778 contended
= qdisc_is_running(q
);
3779 if (unlikely(contended
))
3780 spin_lock(&q
->busylock
);
3782 spin_lock(root_lock
);
3783 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED
, &q
->state
))) {
3784 __qdisc_drop(skb
, &to_free
);
3786 } else if ((q
->flags
& TCQ_F_CAN_BYPASS
) && !qdisc_qlen(q
) &&
3787 qdisc_run_begin(q
)) {
3789 * This is a work-conserving queue; there are no old skbs
3790 * waiting to be sent out; and the qdisc is not running -
3791 * xmit the skb directly.
3794 qdisc_bstats_update(q
, skb
);
3796 if (sch_direct_xmit(skb
, q
, dev
, txq
, root_lock
, true)) {
3797 if (unlikely(contended
)) {
3798 spin_unlock(&q
->busylock
);
3805 rc
= NET_XMIT_SUCCESS
;
3807 rc
= q
->enqueue(skb
, q
, &to_free
) & NET_XMIT_MASK
;
3808 if (qdisc_run_begin(q
)) {
3809 if (unlikely(contended
)) {
3810 spin_unlock(&q
->busylock
);
3817 spin_unlock(root_lock
);
3818 if (unlikely(to_free
))
3819 kfree_skb_list(to_free
);
3820 if (unlikely(contended
))
3821 spin_unlock(&q
->busylock
);
3825 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3826 static void skb_update_prio(struct sk_buff
*skb
)
3828 const struct netprio_map
*map
;
3829 const struct sock
*sk
;
3830 unsigned int prioidx
;
3834 map
= rcu_dereference_bh(skb
->dev
->priomap
);
3837 sk
= skb_to_full_sk(skb
);
3841 prioidx
= sock_cgroup_prioidx(&sk
->sk_cgrp_data
);
3843 if (prioidx
< map
->priomap_len
)
3844 skb
->priority
= map
->priomap
[prioidx
];
3847 #define skb_update_prio(skb)
3851 * dev_loopback_xmit - loop back @skb
3852 * @net: network namespace this loopback is happening in
3853 * @sk: sk needed to be a netfilter okfn
3854 * @skb: buffer to transmit
3856 int dev_loopback_xmit(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
3858 skb_reset_mac_header(skb
);
3859 __skb_pull(skb
, skb_network_offset(skb
));
3860 skb
->pkt_type
= PACKET_LOOPBACK
;
3861 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
3862 WARN_ON(!skb_dst(skb
));
3867 EXPORT_SYMBOL(dev_loopback_xmit
);
3869 #ifdef CONFIG_NET_EGRESS
3870 static struct sk_buff
*
3871 sch_handle_egress(struct sk_buff
*skb
, int *ret
, struct net_device
*dev
)
3873 struct mini_Qdisc
*miniq
= rcu_dereference_bh(dev
->miniq_egress
);
3874 struct tcf_result cl_res
;
3879 /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3880 qdisc_skb_cb(skb
)->mru
= 0;
3881 mini_qdisc_bstats_cpu_update(miniq
, skb
);
3883 switch (tcf_classify(skb
, miniq
->filter_list
, &cl_res
, false)) {
3885 case TC_ACT_RECLASSIFY
:
3886 skb
->tc_index
= TC_H_MIN(cl_res
.classid
);
3889 mini_qdisc_qstats_cpu_drop(miniq
);
3890 *ret
= NET_XMIT_DROP
;
3896 *ret
= NET_XMIT_SUCCESS
;
3899 case TC_ACT_REDIRECT
:
3900 /* No need to push/pop skb's mac_header here on egress! */
3901 skb_do_redirect(skb
);
3902 *ret
= NET_XMIT_SUCCESS
;
3910 #endif /* CONFIG_NET_EGRESS */
3913 static int __get_xps_queue_idx(struct net_device
*dev
, struct sk_buff
*skb
,
3914 struct xps_dev_maps
*dev_maps
, unsigned int tci
)
3916 struct xps_map
*map
;
3917 int queue_index
= -1;
3921 tci
+= netdev_get_prio_tc_map(dev
, skb
->priority
);
3924 map
= rcu_dereference(dev_maps
->attr_map
[tci
]);
3927 queue_index
= map
->queues
[0];
3929 queue_index
= map
->queues
[reciprocal_scale(
3930 skb_get_hash(skb
), map
->len
)];
3931 if (unlikely(queue_index
>= dev
->real_num_tx_queues
))
3938 static int get_xps_queue(struct net_device
*dev
, struct net_device
*sb_dev
,
3939 struct sk_buff
*skb
)
3942 struct xps_dev_maps
*dev_maps
;
3943 struct sock
*sk
= skb
->sk
;
3944 int queue_index
= -1;
3946 if (!static_key_false(&xps_needed
))
3950 if (!static_key_false(&xps_rxqs_needed
))
3953 dev_maps
= rcu_dereference(sb_dev
->xps_rxqs_map
);
3955 int tci
= sk_rx_queue_get(sk
);
3957 if (tci
>= 0 && tci
< dev
->num_rx_queues
)
3958 queue_index
= __get_xps_queue_idx(dev
, skb
, dev_maps
,
3963 if (queue_index
< 0) {
3964 dev_maps
= rcu_dereference(sb_dev
->xps_cpus_map
);
3966 unsigned int tci
= skb
->sender_cpu
- 1;
3968 queue_index
= __get_xps_queue_idx(dev
, skb
, dev_maps
,
3980 u16
dev_pick_tx_zero(struct net_device
*dev
, struct sk_buff
*skb
,
3981 struct net_device
*sb_dev
)
3985 EXPORT_SYMBOL(dev_pick_tx_zero
);
3987 u16
dev_pick_tx_cpu_id(struct net_device
*dev
, struct sk_buff
*skb
,
3988 struct net_device
*sb_dev
)
3990 return (u16
)raw_smp_processor_id() % dev
->real_num_tx_queues
;
3992 EXPORT_SYMBOL(dev_pick_tx_cpu_id
);
3994 u16
netdev_pick_tx(struct net_device
*dev
, struct sk_buff
*skb
,
3995 struct net_device
*sb_dev
)
3997 struct sock
*sk
= skb
->sk
;
3998 int queue_index
= sk_tx_queue_get(sk
);
4000 sb_dev
= sb_dev
? : dev
;
4002 if (queue_index
< 0 || skb
->ooo_okay
||
4003 queue_index
>= dev
->real_num_tx_queues
) {
4004 int new_index
= get_xps_queue(dev
, sb_dev
, skb
);
4007 new_index
= skb_tx_hash(dev
, sb_dev
, skb
);
4009 if (queue_index
!= new_index
&& sk
&&
4011 rcu_access_pointer(sk
->sk_dst_cache
))
4012 sk_tx_queue_set(sk
, new_index
);
4014 queue_index
= new_index
;
4019 EXPORT_SYMBOL(netdev_pick_tx
);
4021 struct netdev_queue
*netdev_core_pick_tx(struct net_device
*dev
,
4022 struct sk_buff
*skb
,
4023 struct net_device
*sb_dev
)
4025 int queue_index
= 0;
4028 u32 sender_cpu
= skb
->sender_cpu
- 1;
4030 if (sender_cpu
>= (u32
)NR_CPUS
)
4031 skb
->sender_cpu
= raw_smp_processor_id() + 1;
4034 if (dev
->real_num_tx_queues
!= 1) {
4035 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4037 if (ops
->ndo_select_queue
)
4038 queue_index
= ops
->ndo_select_queue(dev
, skb
, sb_dev
);
4040 queue_index
= netdev_pick_tx(dev
, skb
, sb_dev
);
4042 queue_index
= netdev_cap_txqueue(dev
, queue_index
);
4045 skb_set_queue_mapping(skb
, queue_index
);
4046 return netdev_get_tx_queue(dev
, queue_index
);
4050 * __dev_queue_xmit - transmit a buffer
4051 * @skb: buffer to transmit
4052 * @sb_dev: suboordinate device used for L2 forwarding offload
4054 * Queue a buffer for transmission to a network device. The caller must
4055 * have set the device and priority and built the buffer before calling
4056 * this function. The function can be called from an interrupt.
4058 * A negative errno code is returned on a failure. A success does not
4059 * guarantee the frame will be transmitted as it may be dropped due
4060 * to congestion or traffic shaping.
4062 * -----------------------------------------------------------------------------------
4063 * I notice this method can also return errors from the queue disciplines,
4064 * including NET_XMIT_DROP, which is a positive value. So, errors can also
4067 * Regardless of the return value, the skb is consumed, so it is currently
4068 * difficult to retry a send to this method. (You can bump the ref count
4069 * before sending to hold a reference for retry if you are careful.)
4071 * When calling this method, interrupts MUST be enabled. This is because
4072 * the BH enable code must have IRQs enabled so that it will not deadlock.
4075 static int __dev_queue_xmit(struct sk_buff
*skb
, struct net_device
*sb_dev
)
4077 struct net_device
*dev
= skb
->dev
;
4078 struct netdev_queue
*txq
;
4083 skb_reset_mac_header(skb
);
4085 if (unlikely(skb_shinfo(skb
)->tx_flags
& SKBTX_SCHED_TSTAMP
))
4086 __skb_tstamp_tx(skb
, NULL
, skb
->sk
, SCM_TSTAMP_SCHED
);
4088 /* Disable soft irqs for various locks below. Also
4089 * stops preemption for RCU.
4093 skb_update_prio(skb
);
4095 qdisc_pkt_len_init(skb
);
4096 #ifdef CONFIG_NET_CLS_ACT
4097 skb
->tc_at_ingress
= 0;
4098 # ifdef CONFIG_NET_EGRESS
4099 if (static_branch_unlikely(&egress_needed_key
)) {
4100 skb
= sch_handle_egress(skb
, &rc
, dev
);
4106 /* If device/qdisc don't need skb->dst, release it right now while
4107 * its hot in this cpu cache.
4109 if (dev
->priv_flags
& IFF_XMIT_DST_RELEASE
)
4114 txq
= netdev_core_pick_tx(dev
, skb
, sb_dev
);
4115 q
= rcu_dereference_bh(txq
->qdisc
);
4117 trace_net_dev_queue(skb
);
4119 rc
= __dev_xmit_skb(skb
, q
, dev
, txq
);
4123 /* The device has no queue. Common case for software devices:
4124 * loopback, all the sorts of tunnels...
4126 * Really, it is unlikely that netif_tx_lock protection is necessary
4127 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
4129 * However, it is possible, that they rely on protection
4132 * Check this and shot the lock. It is not prone from deadlocks.
4133 *Either shot noqueue qdisc, it is even simpler 8)
4135 if (dev
->flags
& IFF_UP
) {
4136 int cpu
= smp_processor_id(); /* ok because BHs are off */
4138 if (txq
->xmit_lock_owner
!= cpu
) {
4139 if (dev_xmit_recursion())
4140 goto recursion_alert
;
4142 skb
= validate_xmit_skb(skb
, dev
, &again
);
4146 PRANDOM_ADD_NOISE(skb
, dev
, txq
, jiffies
);
4147 HARD_TX_LOCK(dev
, txq
, cpu
);
4149 if (!netif_xmit_stopped(txq
)) {
4150 dev_xmit_recursion_inc();
4151 skb
= dev_hard_start_xmit(skb
, dev
, txq
, &rc
);
4152 dev_xmit_recursion_dec();
4153 if (dev_xmit_complete(rc
)) {
4154 HARD_TX_UNLOCK(dev
, txq
);
4158 HARD_TX_UNLOCK(dev
, txq
);
4159 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4162 /* Recursion is detected! It is possible,
4166 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4172 rcu_read_unlock_bh();
4174 atomic_long_inc(&dev
->tx_dropped
);
4175 kfree_skb_list(skb
);
4178 rcu_read_unlock_bh();
4182 int dev_queue_xmit(struct sk_buff
*skb
)
4184 return __dev_queue_xmit(skb
, NULL
);
4186 EXPORT_SYMBOL(dev_queue_xmit
);
4188 int dev_queue_xmit_accel(struct sk_buff
*skb
, struct net_device
*sb_dev
)
4190 return __dev_queue_xmit(skb
, sb_dev
);
4192 EXPORT_SYMBOL(dev_queue_xmit_accel
);
4194 int __dev_direct_xmit(struct sk_buff
*skb
, u16 queue_id
)
4196 struct net_device
*dev
= skb
->dev
;
4197 struct sk_buff
*orig_skb
= skb
;
4198 struct netdev_queue
*txq
;
4199 int ret
= NETDEV_TX_BUSY
;
4202 if (unlikely(!netif_running(dev
) ||
4203 !netif_carrier_ok(dev
)))
4206 skb
= validate_xmit_skb_list(skb
, dev
, &again
);
4207 if (skb
!= orig_skb
)
4210 skb_set_queue_mapping(skb
, queue_id
);
4211 txq
= skb_get_tx_queue(dev
, skb
);
4212 PRANDOM_ADD_NOISE(skb
, dev
, txq
, jiffies
);
4216 dev_xmit_recursion_inc();
4217 HARD_TX_LOCK(dev
, txq
, smp_processor_id());
4218 if (!netif_xmit_frozen_or_drv_stopped(txq
))
4219 ret
= netdev_start_xmit(skb
, dev
, txq
, false);
4220 HARD_TX_UNLOCK(dev
, txq
);
4221 dev_xmit_recursion_dec();
4226 atomic_long_inc(&dev
->tx_dropped
);
4227 kfree_skb_list(skb
);
4228 return NET_XMIT_DROP
;
4230 EXPORT_SYMBOL(__dev_direct_xmit
);
4232 /*************************************************************************
4234 *************************************************************************/
4236 int netdev_max_backlog __read_mostly
= 1000;
4237 EXPORT_SYMBOL(netdev_max_backlog
);
4239 int netdev_tstamp_prequeue __read_mostly
= 1;
4240 int netdev_budget __read_mostly
= 300;
4241 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
4242 unsigned int __read_mostly netdev_budget_usecs
= 2 * USEC_PER_SEC
/ HZ
;
4243 int weight_p __read_mostly
= 64; /* old backlog weight */
4244 int dev_weight_rx_bias __read_mostly
= 1; /* bias for backlog weight */
4245 int dev_weight_tx_bias __read_mostly
= 1; /* bias for output_queue quota */
4246 int dev_rx_weight __read_mostly
= 64;
4247 int dev_tx_weight __read_mostly
= 64;
4248 /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4249 int gro_normal_batch __read_mostly
= 8;
4251 /* Called with irq disabled */
4252 static inline void ____napi_schedule(struct softnet_data
*sd
,
4253 struct napi_struct
*napi
)
4255 list_add_tail(&napi
->poll_list
, &sd
->poll_list
);
4256 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
4261 /* One global table that all flow-based protocols share. */
4262 struct rps_sock_flow_table __rcu
*rps_sock_flow_table __read_mostly
;
4263 EXPORT_SYMBOL(rps_sock_flow_table
);
4264 u32 rps_cpu_mask __read_mostly
;
4265 EXPORT_SYMBOL(rps_cpu_mask
);
4267 struct static_key_false rps_needed __read_mostly
;
4268 EXPORT_SYMBOL(rps_needed
);
4269 struct static_key_false rfs_needed __read_mostly
;
4270 EXPORT_SYMBOL(rfs_needed
);
4272 static struct rps_dev_flow
*
4273 set_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
4274 struct rps_dev_flow
*rflow
, u16 next_cpu
)
4276 if (next_cpu
< nr_cpu_ids
) {
4277 #ifdef CONFIG_RFS_ACCEL
4278 struct netdev_rx_queue
*rxqueue
;
4279 struct rps_dev_flow_table
*flow_table
;
4280 struct rps_dev_flow
*old_rflow
;
4285 /* Should we steer this flow to a different hardware queue? */
4286 if (!skb_rx_queue_recorded(skb
) || !dev
->rx_cpu_rmap
||
4287 !(dev
->features
& NETIF_F_NTUPLE
))
4289 rxq_index
= cpu_rmap_lookup_index(dev
->rx_cpu_rmap
, next_cpu
);
4290 if (rxq_index
== skb_get_rx_queue(skb
))
4293 rxqueue
= dev
->_rx
+ rxq_index
;
4294 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
4297 flow_id
= skb_get_hash(skb
) & flow_table
->mask
;
4298 rc
= dev
->netdev_ops
->ndo_rx_flow_steer(dev
, skb
,
4299 rxq_index
, flow_id
);
4303 rflow
= &flow_table
->flows
[flow_id
];
4305 if (old_rflow
->filter
== rflow
->filter
)
4306 old_rflow
->filter
= RPS_NO_FILTER
;
4310 per_cpu(softnet_data
, next_cpu
).input_queue_head
;
4313 rflow
->cpu
= next_cpu
;
4318 * get_rps_cpu is called from netif_receive_skb and returns the target
4319 * CPU from the RPS map of the receiving queue for a given skb.
4320 * rcu_read_lock must be held on entry.
4322 static int get_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
4323 struct rps_dev_flow
**rflowp
)
4325 const struct rps_sock_flow_table
*sock_flow_table
;
4326 struct netdev_rx_queue
*rxqueue
= dev
->_rx
;
4327 struct rps_dev_flow_table
*flow_table
;
4328 struct rps_map
*map
;
4333 if (skb_rx_queue_recorded(skb
)) {
4334 u16 index
= skb_get_rx_queue(skb
);
4336 if (unlikely(index
>= dev
->real_num_rx_queues
)) {
4337 WARN_ONCE(dev
->real_num_rx_queues
> 1,
4338 "%s received packet on queue %u, but number "
4339 "of RX queues is %u\n",
4340 dev
->name
, index
, dev
->real_num_rx_queues
);
4346 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4348 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
4349 map
= rcu_dereference(rxqueue
->rps_map
);
4350 if (!flow_table
&& !map
)
4353 skb_reset_network_header(skb
);
4354 hash
= skb_get_hash(skb
);
4358 sock_flow_table
= rcu_dereference(rps_sock_flow_table
);
4359 if (flow_table
&& sock_flow_table
) {
4360 struct rps_dev_flow
*rflow
;
4364 /* First check into global flow table if there is a match */
4365 ident
= sock_flow_table
->ents
[hash
& sock_flow_table
->mask
];
4366 if ((ident
^ hash
) & ~rps_cpu_mask
)
4369 next_cpu
= ident
& rps_cpu_mask
;
4371 /* OK, now we know there is a match,
4372 * we can look at the local (per receive queue) flow table
4374 rflow
= &flow_table
->flows
[hash
& flow_table
->mask
];
4378 * If the desired CPU (where last recvmsg was done) is
4379 * different from current CPU (one in the rx-queue flow
4380 * table entry), switch if one of the following holds:
4381 * - Current CPU is unset (>= nr_cpu_ids).
4382 * - Current CPU is offline.
4383 * - The current CPU's queue tail has advanced beyond the
4384 * last packet that was enqueued using this table entry.
4385 * This guarantees that all previous packets for the flow
4386 * have been dequeued, thus preserving in order delivery.
4388 if (unlikely(tcpu
!= next_cpu
) &&
4389 (tcpu
>= nr_cpu_ids
|| !cpu_online(tcpu
) ||
4390 ((int)(per_cpu(softnet_data
, tcpu
).input_queue_head
-
4391 rflow
->last_qtail
)) >= 0)) {
4393 rflow
= set_rps_cpu(dev
, skb
, rflow
, next_cpu
);
4396 if (tcpu
< nr_cpu_ids
&& cpu_online(tcpu
)) {
4406 tcpu
= map
->cpus
[reciprocal_scale(hash
, map
->len
)];
4407 if (cpu_online(tcpu
)) {
4417 #ifdef CONFIG_RFS_ACCEL
4420 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4421 * @dev: Device on which the filter was set
4422 * @rxq_index: RX queue index
4423 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4424 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4426 * Drivers that implement ndo_rx_flow_steer() should periodically call
4427 * this function for each installed filter and remove the filters for
4428 * which it returns %true.
4430 bool rps_may_expire_flow(struct net_device
*dev
, u16 rxq_index
,
4431 u32 flow_id
, u16 filter_id
)
4433 struct netdev_rx_queue
*rxqueue
= dev
->_rx
+ rxq_index
;
4434 struct rps_dev_flow_table
*flow_table
;
4435 struct rps_dev_flow
*rflow
;
4440 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
4441 if (flow_table
&& flow_id
<= flow_table
->mask
) {
4442 rflow
= &flow_table
->flows
[flow_id
];
4443 cpu
= READ_ONCE(rflow
->cpu
);
4444 if (rflow
->filter
== filter_id
&& cpu
< nr_cpu_ids
&&
4445 ((int)(per_cpu(softnet_data
, cpu
).input_queue_head
-
4446 rflow
->last_qtail
) <
4447 (int)(10 * flow_table
->mask
)))
4453 EXPORT_SYMBOL(rps_may_expire_flow
);
4455 #endif /* CONFIG_RFS_ACCEL */
4457 /* Called from hardirq (IPI) context */
4458 static void rps_trigger_softirq(void *data
)
4460 struct softnet_data
*sd
= data
;
4462 ____napi_schedule(sd
, &sd
->backlog
);
4466 #endif /* CONFIG_RPS */
4469 * Check if this softnet_data structure is another cpu one
4470 * If yes, queue it to our IPI list and return 1
4473 static int rps_ipi_queued(struct softnet_data
*sd
)
4476 struct softnet_data
*mysd
= this_cpu_ptr(&softnet_data
);
4479 sd
->rps_ipi_next
= mysd
->rps_ipi_list
;
4480 mysd
->rps_ipi_list
= sd
;
4482 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
4485 #endif /* CONFIG_RPS */
4489 #ifdef CONFIG_NET_FLOW_LIMIT
4490 int netdev_flow_limit_table_len __read_mostly
= (1 << 12);
4493 static bool skb_flow_limit(struct sk_buff
*skb
, unsigned int qlen
)
4495 #ifdef CONFIG_NET_FLOW_LIMIT
4496 struct sd_flow_limit
*fl
;
4497 struct softnet_data
*sd
;
4498 unsigned int old_flow
, new_flow
;
4500 if (qlen
< (netdev_max_backlog
>> 1))
4503 sd
= this_cpu_ptr(&softnet_data
);
4506 fl
= rcu_dereference(sd
->flow_limit
);
4508 new_flow
= skb_get_hash(skb
) & (fl
->num_buckets
- 1);
4509 old_flow
= fl
->history
[fl
->history_head
];
4510 fl
->history
[fl
->history_head
] = new_flow
;
4513 fl
->history_head
&= FLOW_LIMIT_HISTORY
- 1;
4515 if (likely(fl
->buckets
[old_flow
]))
4516 fl
->buckets
[old_flow
]--;
4518 if (++fl
->buckets
[new_flow
] > (FLOW_LIMIT_HISTORY
>> 1)) {
4530 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4531 * queue (may be a remote CPU queue).
4533 static int enqueue_to_backlog(struct sk_buff
*skb
, int cpu
,
4534 unsigned int *qtail
)
4536 struct softnet_data
*sd
;
4537 unsigned long flags
;
4540 sd
= &per_cpu(softnet_data
, cpu
);
4542 local_irq_save(flags
);
4545 if (!netif_running(skb
->dev
))
4547 qlen
= skb_queue_len(&sd
->input_pkt_queue
);
4548 if (qlen
<= netdev_max_backlog
&& !skb_flow_limit(skb
, qlen
)) {
4551 __skb_queue_tail(&sd
->input_pkt_queue
, skb
);
4552 input_queue_tail_incr_save(sd
, qtail
);
4554 local_irq_restore(flags
);
4555 return NET_RX_SUCCESS
;
4558 /* Schedule NAPI for backlog device
4559 * We can use non atomic operation since we own the queue lock
4561 if (!__test_and_set_bit(NAPI_STATE_SCHED
, &sd
->backlog
.state
)) {
4562 if (!rps_ipi_queued(sd
))
4563 ____napi_schedule(sd
, &sd
->backlog
);
4572 local_irq_restore(flags
);
4574 atomic_long_inc(&skb
->dev
->rx_dropped
);
4579 static struct netdev_rx_queue
*netif_get_rxqueue(struct sk_buff
*skb
)
4581 struct net_device
*dev
= skb
->dev
;
4582 struct netdev_rx_queue
*rxqueue
;
4586 if (skb_rx_queue_recorded(skb
)) {
4587 u16 index
= skb_get_rx_queue(skb
);
4589 if (unlikely(index
>= dev
->real_num_rx_queues
)) {
4590 WARN_ONCE(dev
->real_num_rx_queues
> 1,
4591 "%s received packet on queue %u, but number "
4592 "of RX queues is %u\n",
4593 dev
->name
, index
, dev
->real_num_rx_queues
);
4595 return rxqueue
; /* Return first rxqueue */
static u32 netif_receive_generic_xdp(struct sk_buff *skb,
				     struct xdp_buff *xdp,
				     struct bpf_prog *xdp_prog)
{
	struct netdev_rx_queue *rxqueue;
	void *orig_data, *orig_data_end;
	u32 metalen, act = XDP_DROP;
	__be16 orig_eth_type;
	struct ethhdr *eth;
	bool orig_bcast;
	int hlen, off;
	u32 mac_len;

	/* Reinjected packets coming from act_mirred or similar should
	 * not get XDP generic processing.
	 */
	if (skb_is_redirected(skb))
		return XDP_PASS;

	/* XDP packets must be linear and must have sufficient headroom
	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
	 * native XDP provides, thus we need to do it here as well.
	 */
	if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
		int troom = skb->tail + skb->data_len - skb->end;

		/* In case we have to go down the path and also linearize,
		 * then lets do the pskb_expand_head() work just once here.
		 */
		if (pskb_expand_head(skb,
				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
			goto do_drop;
		if (skb_linearize(skb))
			goto do_drop;
	}

	/* The XDP program wants to see the packet starting at the MAC
	 * header.
	 */
	mac_len = skb->data - skb_mac_header(skb);
	hlen = skb_headlen(skb) + mac_len;
	xdp->data = skb->data - mac_len;
	xdp->data_meta = xdp->data;
	xdp->data_end = xdp->data + hlen;
	xdp->data_hard_start = skb->data - skb_headroom(skb);

	/* SKB "head" area always have tailroom for skb_shared_info */
	xdp->frame_sz  = (void *)skb_end_pointer(skb) - xdp->data_hard_start;
	xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	orig_data_end = xdp->data_end;
	orig_data = xdp->data;
	eth = (struct ethhdr *)xdp->data;
	orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
	orig_eth_type = eth->h_proto;

	rxqueue = netif_get_rxqueue(skb);
	xdp->rxq = &rxqueue->xdp_rxq;

	act = bpf_prog_run_xdp(xdp_prog, xdp);

	/* check if bpf_xdp_adjust_head was used */
	off = xdp->data - orig_data;
	if (off) {
		if (off > 0)
			__skb_pull(skb, off);
		else if (off < 0)
			__skb_push(skb, -off);

		skb->mac_header += off;
		skb_reset_network_header(skb);
	}

	/* check if bpf_xdp_adjust_tail was used */
	off = xdp->data_end - orig_data_end;
	if (off != 0) {
		skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
		skb->len += off; /* positive on grow, negative on shrink */
	}

	/* check if XDP changed eth hdr such SKB needs update */
	eth = (struct ethhdr *)xdp->data;
	if ((orig_eth_type != eth->h_proto) ||
	    (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
	}

	switch (act) {
	case XDP_REDIRECT:
	case XDP_TX:
		__skb_push(skb, mac_len);
		break;
	case XDP_PASS:
		metalen = xdp->data - xdp->data_meta;
		if (metalen)
			skb_metadata_set(skb, metalen);
		break;
	default:
		bpf_warn_invalid_xdp_action(act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(skb->dev, xdp_prog, act);
		fallthrough;
	case XDP_DROP:
	do_drop:
		kfree_skb(skb);
		break;
	}

	return act;
}
/* When doing generic XDP we have to bypass the qdisc layer and the
 * network taps in order to match in-driver-XDP behavior.
 */
void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	bool free_skb = true;
	int cpu, rc;

	txq = netdev_core_pick_tx(dev, skb, NULL);
	cpu = smp_processor_id();
	HARD_TX_LOCK(dev, txq, cpu);
	if (!netif_xmit_stopped(txq)) {
		rc = netdev_start_xmit(skb, dev, txq, 0);
		if (dev_xmit_complete(rc))
			free_skb = false;
	}
	HARD_TX_UNLOCK(dev, txq);
	if (free_skb) {
		trace_xdp_exception(dev, xdp_prog, XDP_TX);
		kfree_skb(skb);
	}
}

static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
{
	if (xdp_prog) {
		struct xdp_buff xdp;
		u32 act;
		int err;

		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
		if (act != XDP_PASS) {
			switch (act) {
			case XDP_REDIRECT:
				err = xdp_do_generic_redirect(skb->dev, skb,
							      &xdp, xdp_prog);
				if (err)
					goto out_redir;
				break;
			case XDP_TX:
				generic_xdp_tx(skb, xdp_prog);
				break;
			}
			return XDP_DROP;
		}
	}
	return XDP_PASS;
out_redir:
	kfree_skb(skb);
	return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
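/*
 * Illustrative sketch (not part of the original file): how a driver that
 * builds its own skbs (tun-like devices, for example) might run the generic
 * XDP hook before handing the packet to the stack, mirroring the way
 * __netif_receive_skb_core() uses do_xdp_generic(). "my_dev_rx" is a
 * hypothetical helper; on anything other than XDP_PASS the skb has already
 * been consumed by the XDP path.
 */
static __maybe_unused int my_dev_rx(struct net_device *dev, struct sk_buff *skb)
{
	struct bpf_prog *xdp_prog;
	int act = XDP_PASS;

	rcu_read_lock();
	xdp_prog = rcu_dereference(dev->xdp_prog);
	if (xdp_prog)
		act = do_xdp_generic(xdp_prog, skb);
	rcu_read_unlock();

	if (act != XDP_PASS)
		return NET_RX_DROP;	/* consumed or dropped by XDP */

	return netif_rx(skb);
}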
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);

#ifdef CONFIG_RPS
	if (static_branch_unlikely(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process. It always succeeds. The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * Return values:
 * NET_RX_SUCCESS (no congestion)
 * NET_RX_DROP    (packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	trace_netif_rx_exit(ret);

	return ret;
}
EXPORT_SYMBOL(netif_rx);
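/*
 * Illustrative sketch (not part of the original file): the classic driver
 * receive path that ends in netif_rx(). "my_isr_rx" and the buffer/length
 * parameters are hypothetical; eth_type_trans() fills in skb->protocol and
 * skb->dev before the packet is queued to the per-CPU backlog.
 */
static __maybe_unused void my_isr_rx(struct net_device *dev,
				     const void *buf, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	skb_put_data(skb, buf, len);
	skb->protocol = eth_type_trans(skb, dev);

	/* Queues to the per-CPU backlog; safe from interrupt context. */
	netif_rx(skb);
}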
int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();
	trace_netif_rx_ni_exit(err);

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);

int netif_rx_any_context(struct sk_buff *skb)
{
	/*
	 * If invoked from contexts which do not invoke bottom half
	 * processing either at return from interrupt or when softirqs are
	 * reenabled, use netif_rx_ni() which invokes bottomhalf processing.
	 */
	if (in_interrupt())
		return netif_rx(skb);
	else
		return netif_rx_ni(skb);
}
EXPORT_SYMBOL(netif_rx_any_context);
4870 static __latent_entropy
void net_tx_action(struct softirq_action
*h
)
4872 struct softnet_data
*sd
= this_cpu_ptr(&softnet_data
);
4874 if (sd
->completion_queue
) {
4875 struct sk_buff
*clist
;
4877 local_irq_disable();
4878 clist
= sd
->completion_queue
;
4879 sd
->completion_queue
= NULL
;
4883 struct sk_buff
*skb
= clist
;
4885 clist
= clist
->next
;
4887 WARN_ON(refcount_read(&skb
->users
));
4888 if (likely(get_kfree_skb_cb(skb
)->reason
== SKB_REASON_CONSUMED
))
4889 trace_consume_skb(skb
);
4891 trace_kfree_skb(skb
, net_tx_action
);
4893 if (skb
->fclone
!= SKB_FCLONE_UNAVAILABLE
)
4896 __kfree_skb_defer(skb
);
4899 __kfree_skb_flush();
4902 if (sd
->output_queue
) {
4905 local_irq_disable();
4906 head
= sd
->output_queue
;
4907 sd
->output_queue
= NULL
;
4908 sd
->output_queue_tailp
= &sd
->output_queue
;
4912 struct Qdisc
*q
= head
;
4913 spinlock_t
*root_lock
= NULL
;
4915 head
= head
->next_sched
;
4917 if (!(q
->flags
& TCQ_F_NOLOCK
)) {
4918 root_lock
= qdisc_lock(q
);
4919 spin_lock(root_lock
);
4921 /* We need to make sure head->next_sched is read
4922 * before clearing __QDISC_STATE_SCHED
4924 smp_mb__before_atomic();
4925 clear_bit(__QDISC_STATE_SCHED
, &q
->state
);
4928 spin_unlock(root_lock
);
4932 xfrm_dev_backlog(sd
);
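/*
 * Illustrative sketch (not part of the original file): how a driver
 * TX-completion interrupt feeds the per-CPU completion queue drained by
 * net_tx_action() above. "my_tx_ring" and "my_clean_tx_irq" are
 * hypothetical; the point is that skbs must not be freed directly from
 * hard-IRQ context, so dev_consume_skb_irq() defers the actual free to the
 * TX softirq.
 */
struct my_tx_ring {
	struct sk_buff *queued_skb;	/* hypothetical single-slot ring */
};

static __maybe_unused void my_clean_tx_irq(struct my_tx_ring *ring)
{
	struct sk_buff *skb = ring->queued_skb;

	if (!skb)
		return;

	ring->queued_skb = NULL;
	/* Lands on softnet_data->completion_queue, freed in net_tx_action(). */
	dev_consume_skb_irq(skb);
}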
#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
4942 static inline struct sk_buff
*
4943 sch_handle_ingress(struct sk_buff
*skb
, struct packet_type
**pt_prev
, int *ret
,
4944 struct net_device
*orig_dev
, bool *another
)
4946 #ifdef CONFIG_NET_CLS_ACT
4947 struct mini_Qdisc
*miniq
= rcu_dereference_bh(skb
->dev
->miniq_ingress
);
4948 struct tcf_result cl_res
;
4950 /* If there's at least one ingress present somewhere (so
4951 * we get here via enabled static key), remaining devices
4952 * that are not configured with an ingress qdisc will bail
4959 *ret
= deliver_skb(skb
, *pt_prev
, orig_dev
);
4963 qdisc_skb_cb(skb
)->pkt_len
= skb
->len
;
4964 qdisc_skb_cb(skb
)->mru
= 0;
4965 skb
->tc_at_ingress
= 1;
4966 mini_qdisc_bstats_cpu_update(miniq
, skb
);
4968 switch (tcf_classify_ingress(skb
, miniq
->block
, miniq
->filter_list
,
4971 case TC_ACT_RECLASSIFY
:
4972 skb
->tc_index
= TC_H_MIN(cl_res
.classid
);
4975 mini_qdisc_qstats_cpu_drop(miniq
);
4983 case TC_ACT_REDIRECT
:
4984 /* skb_mac_header check was done by cls/act_bpf, so
4985 * we can safely push the L2 header back before
4986 * redirecting to another netdev
4988 __skb_push(skb
, skb
->mac_len
);
4989 if (skb_do_redirect(skb
) == -EAGAIN
) {
4990 __skb_pull(skb
, skb
->mac_len
);
4995 case TC_ACT_CONSUMED
:
5000 #endif /* CONFIG_NET_CLS_ACT */
/**
 * netdev_is_rx_handler_busy - check if receive handler is registered
 * @dev: device to check
 *
 * Check if a receive handler is already registered for a given device.
 * Return true if there is one.
 *
 * The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb. A negative errno code is returned
 * on a failure.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	if (netdev_is_rx_handler_busy(dev))
		return -EBUSY;

	if (dev->priv_flags & IFF_NO_RX_HANDLER)
		return -EINVAL;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 * netdev_rx_handler_unregister - unregister receive handler
 * @dev: device to unregister a handler from
 *
 * Unregister a receive handler from a device.
 *
 * The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
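/*
 * Illustrative sketch (not part of the original file): how a stacked device
 * (in the style of bridge or team) would attach itself to a lower device's
 * receive path with the helpers above. "my_port", "my_handle_frame" and
 * "my_port_attach" are hypothetical; the RTNL requirement and the
 * rx_handler_data-before-rx_handler ordering match the rules documented for
 * netdev_rx_handler_register().
 */
struct my_port {
	struct net_device *lower_dev;
};

static __maybe_unused rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct my_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (!port)
		return RX_HANDLER_PASS;

	/* A real handler would steer skb towards its upper device here. */
	return RX_HANDLER_PASS;
}

static __maybe_unused int my_port_attach(struct my_port *port)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(port->lower_dev, my_handle_frame,
					  port);
}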
/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}

static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
	if (nf_hook_ingress_active(skb)) {
		int ingress_retval;

		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		rcu_read_lock();
		ingress_retval = nf_hook_ingress(skb);
		rcu_read_unlock();
		return ingress_retval;
	}
	return 0;
}
5111 static int __netif_receive_skb_core(struct sk_buff
**pskb
, bool pfmemalloc
,
5112 struct packet_type
**ppt_prev
)
5114 struct packet_type
*ptype
, *pt_prev
;
5115 rx_handler_func_t
*rx_handler
;
5116 struct sk_buff
*skb
= *pskb
;
5117 struct net_device
*orig_dev
;
5118 bool deliver_exact
= false;
5119 int ret
= NET_RX_DROP
;
5122 net_timestamp_check(!netdev_tstamp_prequeue
, skb
);
5124 trace_netif_receive_skb(skb
);
5126 orig_dev
= skb
->dev
;
5128 skb_reset_network_header(skb
);
5129 if (!skb_transport_header_was_set(skb
))
5130 skb_reset_transport_header(skb
);
5131 skb_reset_mac_len(skb
);
5136 skb
->skb_iif
= skb
->dev
->ifindex
;
5138 __this_cpu_inc(softnet_data
.processed
);
5140 if (static_branch_unlikely(&generic_xdp_needed_key
)) {
5144 ret2
= do_xdp_generic(rcu_dereference(skb
->dev
->xdp_prog
), skb
);
5147 if (ret2
!= XDP_PASS
) {
5151 skb_reset_mac_len(skb
);
5154 if (skb
->protocol
== cpu_to_be16(ETH_P_8021Q
) ||
5155 skb
->protocol
== cpu_to_be16(ETH_P_8021AD
)) {
5156 skb
= skb_vlan_untag(skb
);
5161 if (skb_skip_tc_classify(skb
))
5167 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
5169 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
5173 list_for_each_entry_rcu(ptype
, &skb
->dev
->ptype_all
, list
) {
5175 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
5180 #ifdef CONFIG_NET_INGRESS
5181 if (static_branch_unlikely(&ingress_needed_key
)) {
5182 bool another
= false;
5184 skb
= sch_handle_ingress(skb
, &pt_prev
, &ret
, orig_dev
,
5191 if (nf_ingress(skb
, &pt_prev
, &ret
, orig_dev
) < 0)
5195 skb_reset_redirect(skb
);
5197 if (pfmemalloc
&& !skb_pfmemalloc_protocol(skb
))
5200 if (skb_vlan_tag_present(skb
)) {
5202 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
5205 if (vlan_do_receive(&skb
))
5207 else if (unlikely(!skb
))
5211 rx_handler
= rcu_dereference(skb
->dev
->rx_handler
);
5214 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
5217 switch (rx_handler(&skb
)) {
5218 case RX_HANDLER_CONSUMED
:
5219 ret
= NET_RX_SUCCESS
;
5221 case RX_HANDLER_ANOTHER
:
5223 case RX_HANDLER_EXACT
:
5224 deliver_exact
= true;
5225 case RX_HANDLER_PASS
:
5232 if (unlikely(skb_vlan_tag_present(skb
)) && !netdev_uses_dsa(skb
->dev
)) {
5234 if (skb_vlan_tag_get_id(skb
)) {
5235 /* Vlan id is non 0 and vlan_do_receive() above couldn't
5238 skb
->pkt_type
= PACKET_OTHERHOST
;
5239 } else if (skb
->protocol
== cpu_to_be16(ETH_P_8021Q
) ||
5240 skb
->protocol
== cpu_to_be16(ETH_P_8021AD
)) {
5241 /* Outer header is 802.1P with vlan 0, inner header is
5242 * 802.1Q or 802.1AD and vlan_do_receive() above could
5243 * not find vlan dev for vlan id 0.
5245 __vlan_hwaccel_clear_tag(skb
);
5246 skb
= skb_vlan_untag(skb
);
5249 if (vlan_do_receive(&skb
))
5250 /* After stripping off 802.1P header with vlan 0
5251 * vlan dev is found for inner header.
5254 else if (unlikely(!skb
))
5257 /* We have stripped outer 802.1P vlan 0 header.
5258 * But could not find vlan dev.
5259 * check again for vlan id to set OTHERHOST.
5263 /* Note: we might in the future use prio bits
5264 * and set skb->priority like in vlan_do_receive()
5265 * For the time being, just ignore Priority Code Point
5267 __vlan_hwaccel_clear_tag(skb
);
5270 type
= skb
->protocol
;
5272 /* deliver only exact match when indicated */
5273 if (likely(!deliver_exact
)) {
5274 deliver_ptype_list_skb(skb
, &pt_prev
, orig_dev
, type
,
5275 &ptype_base
[ntohs(type
) &
5279 deliver_ptype_list_skb(skb
, &pt_prev
, orig_dev
, type
,
5280 &orig_dev
->ptype_specific
);
5282 if (unlikely(skb
->dev
!= orig_dev
)) {
5283 deliver_ptype_list_skb(skb
, &pt_prev
, orig_dev
, type
,
5284 &skb
->dev
->ptype_specific
);
5288 if (unlikely(skb_orphan_frags_rx(skb
, GFP_ATOMIC
)))
5290 *ppt_prev
= pt_prev
;
5294 atomic_long_inc(&skb
->dev
->rx_dropped
);
5296 atomic_long_inc(&skb
->dev
->rx_nohandler
);
5298 /* Jamal, now you will not able to escape explaining
5299 * me how you were going to use this. :-)
5305 /* The invariant here is that if *ppt_prev is not NULL
5306 * then skb should also be non-NULL.
5308 * Apparently *ppt_prev assignment above holds this invariant due to
5309 * skb dereferencing near it.
5315 static int __netif_receive_skb_one_core(struct sk_buff
*skb
, bool pfmemalloc
)
5317 struct net_device
*orig_dev
= skb
->dev
;
5318 struct packet_type
*pt_prev
= NULL
;
5321 ret
= __netif_receive_skb_core(&skb
, pfmemalloc
, &pt_prev
);
5323 ret
= INDIRECT_CALL_INET(pt_prev
->func
, ipv6_rcv
, ip_rcv
, skb
,
5324 skb
->dev
, pt_prev
, orig_dev
);
5329 * netif_receive_skb_core - special purpose version of netif_receive_skb
5330 * @skb: buffer to process
5332 * More direct receive version of netif_receive_skb(). It should
5333 * only be used by callers that have a need to skip RPS and Generic XDP.
5334 * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5336 * This function may only be called from softirq context and interrupts
5337 * should be enabled.
5339 * Return values (usually ignored):
5340 * NET_RX_SUCCESS: no congestion
5341 * NET_RX_DROP: packet was dropped
5343 int netif_receive_skb_core(struct sk_buff
*skb
)
5348 ret
= __netif_receive_skb_one_core(skb
, false);
5353 EXPORT_SYMBOL(netif_receive_skb_core
);
5355 static inline void __netif_receive_skb_list_ptype(struct list_head
*head
,
5356 struct packet_type
*pt_prev
,
5357 struct net_device
*orig_dev
)
5359 struct sk_buff
*skb
, *next
;
5363 if (list_empty(head
))
5365 if (pt_prev
->list_func
!= NULL
)
5366 INDIRECT_CALL_INET(pt_prev
->list_func
, ipv6_list_rcv
,
5367 ip_list_rcv
, head
, pt_prev
, orig_dev
);
5369 list_for_each_entry_safe(skb
, next
, head
, list
) {
5370 skb_list_del_init(skb
);
5371 pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
5375 static void __netif_receive_skb_list_core(struct list_head
*head
, bool pfmemalloc
)
5377 /* Fast-path assumptions:
5378 * - There is no RX handler.
5379 * - Only one packet_type matches.
5380 * If either of these fails, we will end up doing some per-packet
5381 * processing in-line, then handling the 'last ptype' for the whole
5382 * sublist. This can't cause out-of-order delivery to any single ptype,
5383 * because the 'last ptype' must be constant across the sublist, and all
5384 * other ptypes are handled per-packet.
5386 /* Current (common) ptype of sublist */
5387 struct packet_type
*pt_curr
= NULL
;
5388 /* Current (common) orig_dev of sublist */
5389 struct net_device
*od_curr
= NULL
;
5390 struct list_head sublist
;
5391 struct sk_buff
*skb
, *next
;
5393 INIT_LIST_HEAD(&sublist
);
5394 list_for_each_entry_safe(skb
, next
, head
, list
) {
5395 struct net_device
*orig_dev
= skb
->dev
;
5396 struct packet_type
*pt_prev
= NULL
;
5398 skb_list_del_init(skb
);
5399 __netif_receive_skb_core(&skb
, pfmemalloc
, &pt_prev
);
5402 if (pt_curr
!= pt_prev
|| od_curr
!= orig_dev
) {
5403 /* dispatch old sublist */
5404 __netif_receive_skb_list_ptype(&sublist
, pt_curr
, od_curr
);
5405 /* start new sublist */
5406 INIT_LIST_HEAD(&sublist
);
5410 list_add_tail(&skb
->list
, &sublist
);
5413 /* dispatch final sublist */
5414 __netif_receive_skb_list_ptype(&sublist
, pt_curr
, od_curr
);
5417 static int __netif_receive_skb(struct sk_buff
*skb
)
5421 if (sk_memalloc_socks() && skb_pfmemalloc(skb
)) {
5422 unsigned int noreclaim_flag
;
5425 * PFMEMALLOC skbs are special, they should
5426 * - be delivered to SOCK_MEMALLOC sockets only
5427 * - stay away from userspace
5428 * - have bounded memory usage
5430 * Use PF_MEMALLOC as this saves us from propagating the allocation
5431 * context down to all allocation sites.
5433 noreclaim_flag
= memalloc_noreclaim_save();
5434 ret
= __netif_receive_skb_one_core(skb
, true);
5435 memalloc_noreclaim_restore(noreclaim_flag
);
5437 ret
= __netif_receive_skb_one_core(skb
, false);
5442 static void __netif_receive_skb_list(struct list_head
*head
)
5444 unsigned long noreclaim_flag
= 0;
5445 struct sk_buff
*skb
, *next
;
5446 bool pfmemalloc
= false; /* Is current sublist PF_MEMALLOC? */
5448 list_for_each_entry_safe(skb
, next
, head
, list
) {
5449 if ((sk_memalloc_socks() && skb_pfmemalloc(skb
)) != pfmemalloc
) {
5450 struct list_head sublist
;
5452 /* Handle the previous sublist */
5453 list_cut_before(&sublist
, head
, &skb
->list
);
5454 if (!list_empty(&sublist
))
5455 __netif_receive_skb_list_core(&sublist
, pfmemalloc
);
5456 pfmemalloc
= !pfmemalloc
;
5457 /* See comments in __netif_receive_skb */
5459 noreclaim_flag
= memalloc_noreclaim_save();
5461 memalloc_noreclaim_restore(noreclaim_flag
);
5464 /* Handle the remaining sublist */
5465 if (!list_empty(head
))
5466 __netif_receive_skb_list_core(head
, pfmemalloc
);
5467 /* Restore pflags */
5469 memalloc_noreclaim_restore(noreclaim_flag
);
5472 static int generic_xdp_install(struct net_device
*dev
, struct netdev_bpf
*xdp
)
5474 struct bpf_prog
*old
= rtnl_dereference(dev
->xdp_prog
);
5475 struct bpf_prog
*new = xdp
->prog
;
5481 mutex_lock(&new->aux
->used_maps_mutex
);
5483 /* generic XDP does not work with DEVMAPs that can
5484 * have a bpf_prog installed on an entry
5486 for (i
= 0; i
< new->aux
->used_map_cnt
; i
++) {
5487 if (dev_map_can_have_prog(new->aux
->used_maps
[i
]) ||
5488 cpu_map_prog_allowed(new->aux
->used_maps
[i
])) {
5489 mutex_unlock(&new->aux
->used_maps_mutex
);
5494 mutex_unlock(&new->aux
->used_maps_mutex
);
5497 switch (xdp
->command
) {
5498 case XDP_SETUP_PROG
:
5499 rcu_assign_pointer(dev
->xdp_prog
, new);
5504 static_branch_dec(&generic_xdp_needed_key
);
5505 } else if (new && !old
) {
5506 static_branch_inc(&generic_xdp_needed_key
);
5507 dev_disable_lro(dev
);
5508 dev_disable_gro_hw(dev
);
5520 static int netif_receive_skb_internal(struct sk_buff
*skb
)
5524 net_timestamp_check(netdev_tstamp_prequeue
, skb
);
5526 if (skb_defer_rx_timestamp(skb
))
5527 return NET_RX_SUCCESS
;
5531 if (static_branch_unlikely(&rps_needed
)) {
5532 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
5533 int cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
5536 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
5542 ret
= __netif_receive_skb(skb
);
5547 static void netif_receive_skb_list_internal(struct list_head
*head
)
5549 struct sk_buff
*skb
, *next
;
5550 struct list_head sublist
;
5552 INIT_LIST_HEAD(&sublist
);
5553 list_for_each_entry_safe(skb
, next
, head
, list
) {
5554 net_timestamp_check(netdev_tstamp_prequeue
, skb
);
5555 skb_list_del_init(skb
);
5556 if (!skb_defer_rx_timestamp(skb
))
5557 list_add_tail(&skb
->list
, &sublist
);
5559 list_splice_init(&sublist
, head
);
5563 if (static_branch_unlikely(&rps_needed
)) {
5564 list_for_each_entry_safe(skb
, next
, head
, list
) {
5565 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
5566 int cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
5569 /* Will be handled, remove from list */
5570 skb_list_del_init(skb
);
5571 enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
5576 __netif_receive_skb_list(head
);
/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * netif_receive_skb() is the main receive data processing function.
 * It always succeeds. The buffer may be dropped during processing
 * for congestion control or by the protocol layers.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);

	ret = netif_receive_skb_internal(skb);
	trace_netif_receive_skb_exit(ret);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);

/**
 * netif_receive_skb_list - process many receive buffers from network
 * @head: list of skbs to process.
 *
 * Since return value of netif_receive_skb() is normally ignored, and
 * wouldn't be meaningful for a list, this function returns void.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 */
void netif_receive_skb_list(struct list_head *head)
{
	struct sk_buff *skb;

	if (list_empty(head))
		return;
	if (trace_netif_receive_skb_list_entry_enabled()) {
		list_for_each_entry(skb, head, list)
			trace_netif_receive_skb_list_entry(skb);
	}
	netif_receive_skb_list_internal(head);
	trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);
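/*
 * Illustrative sketch (not part of the original file): batching received
 * packets and handing them to netif_receive_skb_list() in one call, as a
 * driver without GRO batching might do. "my_flush_rx_batch" and its
 * parameters are hypothetical; chaining via skb->list with list_add_tail()
 * is the pattern used throughout this file.
 */
static __maybe_unused void my_flush_rx_batch(struct sk_buff **skbs, int n)
{
	LIST_HEAD(rx_list);
	int i;

	for (i = 0; i < n; i++)
		list_add_tail(&skbs[i]->list, &rx_list);

	/* One trip through the stack for the whole batch. */
	netif_receive_skb_list(&rx_list);
}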
5633 static DEFINE_PER_CPU(struct work_struct
, flush_works
);
5635 /* Network device is going away, flush any packets still pending */
5636 static void flush_backlog(struct work_struct
*work
)
5638 struct sk_buff
*skb
, *tmp
;
5639 struct softnet_data
*sd
;
5642 sd
= this_cpu_ptr(&softnet_data
);
5644 local_irq_disable();
5646 skb_queue_walk_safe(&sd
->input_pkt_queue
, skb
, tmp
) {
5647 if (skb
->dev
->reg_state
== NETREG_UNREGISTERING
) {
5648 __skb_unlink(skb
, &sd
->input_pkt_queue
);
5649 dev_kfree_skb_irq(skb
);
5650 input_queue_head_incr(sd
);
5656 skb_queue_walk_safe(&sd
->process_queue
, skb
, tmp
) {
5657 if (skb
->dev
->reg_state
== NETREG_UNREGISTERING
) {
5658 __skb_unlink(skb
, &sd
->process_queue
);
5660 input_queue_head_incr(sd
);
5666 static bool flush_required(int cpu
)
5668 #if IS_ENABLED(CONFIG_RPS)
5669 struct softnet_data
*sd
= &per_cpu(softnet_data
, cpu
);
5672 local_irq_disable();
5675 /* as insertion into process_queue happens with the rps lock held,
5676 * process_queue access may race only with dequeue
5678 do_flush
= !skb_queue_empty(&sd
->input_pkt_queue
) ||
5679 !skb_queue_empty_lockless(&sd
->process_queue
);
5685 /* without RPS we can't safely check input_pkt_queue: during a
5686 * concurrent remote skb_queue_splice() we can detect as empty both
5687 * input_pkt_queue and process_queue even if the latter could end-up
5688 * containing a lot of packets.
5693 static void flush_all_backlogs(void)
5695 static cpumask_t flush_cpus
;
5698 /* since we are under rtnl lock protection we can use static data
5699 * for the cpumask and avoid allocating on stack the possibly
5706 cpumask_clear(&flush_cpus
);
5707 for_each_online_cpu(cpu
) {
5708 if (flush_required(cpu
)) {
5709 queue_work_on(cpu
, system_highpri_wq
,
5710 per_cpu_ptr(&flush_works
, cpu
));
5711 cpumask_set_cpu(cpu
, &flush_cpus
);
5715 /* we can have in flight packet[s] on the cpus we are not flushing,
5716 * synchronize_net() in rollback_registered_many() will take care of
5719 for_each_cpu(cpu
, &flush_cpus
)
5720 flush_work(per_cpu_ptr(&flush_works
, cpu
));
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
static void gro_normal_list(struct napi_struct *napi)
{
	if (!napi->rx_count)
		return;
	netif_receive_skb_list_internal(&napi->rx_list);
	INIT_LIST_HEAD(&napi->rx_list);
	napi->rx_count = 0;
}

/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
 * pass the whole batch up to the stack.
 */
static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
{
	list_add_tail(&skb->list, &napi->rx_list);
	if (++napi->rx_count >= gro_normal_batch)
		gro_normal_list(napi);
}
5745 INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff
*, int));
5746 INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff
*, int));
5747 static int napi_gro_complete(struct napi_struct
*napi
, struct sk_buff
*skb
)
5749 struct packet_offload
*ptype
;
5750 __be16 type
= skb
->protocol
;
5751 struct list_head
*head
= &offload_base
;
5754 BUILD_BUG_ON(sizeof(struct napi_gro_cb
) > sizeof(skb
->cb
));
5756 if (NAPI_GRO_CB(skb
)->count
== 1) {
5757 skb_shinfo(skb
)->gso_size
= 0;
5762 list_for_each_entry_rcu(ptype
, head
, list
) {
5763 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_complete
)
5766 err
= INDIRECT_CALL_INET(ptype
->callbacks
.gro_complete
,
5767 ipv6_gro_complete
, inet_gro_complete
,
5774 WARN_ON(&ptype
->list
== head
);
5776 return NET_RX_SUCCESS
;
5780 gro_normal_one(napi
, skb
);
5781 return NET_RX_SUCCESS
;
5784 static void __napi_gro_flush_chain(struct napi_struct
*napi
, u32 index
,
5787 struct list_head
*head
= &napi
->gro_hash
[index
].list
;
5788 struct sk_buff
*skb
, *p
;
5790 list_for_each_entry_safe_reverse(skb
, p
, head
, list
) {
5791 if (flush_old
&& NAPI_GRO_CB(skb
)->age
== jiffies
)
5793 skb_list_del_init(skb
);
5794 napi_gro_complete(napi
, skb
);
5795 napi
->gro_hash
[index
].count
--;
5798 if (!napi
->gro_hash
[index
].count
)
5799 __clear_bit(index
, &napi
->gro_bitmask
);
5802 /* napi->gro_hash[].list contains packets ordered by age.
5803 * youngest packets at the head of it.
5804 * Complete skbs in reverse order to reduce latencies.
5806 void napi_gro_flush(struct napi_struct
*napi
, bool flush_old
)
5808 unsigned long bitmask
= napi
->gro_bitmask
;
5809 unsigned int i
, base
= ~0U;
5811 while ((i
= ffs(bitmask
)) != 0) {
5814 __napi_gro_flush_chain(napi
, base
, flush_old
);
5817 EXPORT_SYMBOL(napi_gro_flush
);
5819 static struct list_head
*gro_list_prepare(struct napi_struct
*napi
,
5820 struct sk_buff
*skb
)
5822 unsigned int maclen
= skb
->dev
->hard_header_len
;
5823 u32 hash
= skb_get_hash_raw(skb
);
5824 struct list_head
*head
;
5827 head
= &napi
->gro_hash
[hash
& (GRO_HASH_BUCKETS
- 1)].list
;
5828 list_for_each_entry(p
, head
, list
) {
5829 unsigned long diffs
;
5831 NAPI_GRO_CB(p
)->flush
= 0;
5833 if (hash
!= skb_get_hash_raw(p
)) {
5834 NAPI_GRO_CB(p
)->same_flow
= 0;
5838 diffs
= (unsigned long)p
->dev
^ (unsigned long)skb
->dev
;
5839 diffs
|= skb_vlan_tag_present(p
) ^ skb_vlan_tag_present(skb
);
5840 if (skb_vlan_tag_present(p
))
5841 diffs
|= skb_vlan_tag_get(p
) ^ skb_vlan_tag_get(skb
);
5842 diffs
|= skb_metadata_dst_cmp(p
, skb
);
5843 diffs
|= skb_metadata_differs(p
, skb
);
5844 if (maclen
== ETH_HLEN
)
5845 diffs
|= compare_ether_header(skb_mac_header(p
),
5846 skb_mac_header(skb
));
5848 diffs
= memcmp(skb_mac_header(p
),
5849 skb_mac_header(skb
),
5851 NAPI_GRO_CB(p
)->same_flow
= !diffs
;
5857 static void skb_gro_reset_offset(struct sk_buff
*skb
)
5859 const struct skb_shared_info
*pinfo
= skb_shinfo(skb
);
5860 const skb_frag_t
*frag0
= &pinfo
->frags
[0];
5862 NAPI_GRO_CB(skb
)->data_offset
= 0;
5863 NAPI_GRO_CB(skb
)->frag0
= NULL
;
5864 NAPI_GRO_CB(skb
)->frag0_len
= 0;
5866 if (!skb_headlen(skb
) && pinfo
->nr_frags
&&
5867 !PageHighMem(skb_frag_page(frag0
))) {
5868 NAPI_GRO_CB(skb
)->frag0
= skb_frag_address(frag0
);
5869 NAPI_GRO_CB(skb
)->frag0_len
= min_t(unsigned int,
5870 skb_frag_size(frag0
),
5871 skb
->end
- skb
->tail
);
5875 static void gro_pull_from_frag0(struct sk_buff
*skb
, int grow
)
5877 struct skb_shared_info
*pinfo
= skb_shinfo(skb
);
5879 BUG_ON(skb
->end
- skb
->tail
< grow
);
5881 memcpy(skb_tail_pointer(skb
), NAPI_GRO_CB(skb
)->frag0
, grow
);
5883 skb
->data_len
-= grow
;
5886 skb_frag_off_add(&pinfo
->frags
[0], grow
);
5887 skb_frag_size_sub(&pinfo
->frags
[0], grow
);
5889 if (unlikely(!skb_frag_size(&pinfo
->frags
[0]))) {
5890 skb_frag_unref(skb
, 0);
5891 memmove(pinfo
->frags
, pinfo
->frags
+ 1,
5892 --pinfo
->nr_frags
* sizeof(pinfo
->frags
[0]));
5896 static void gro_flush_oldest(struct napi_struct
*napi
, struct list_head
*head
)
5898 struct sk_buff
*oldest
;
5900 oldest
= list_last_entry(head
, struct sk_buff
, list
);
5902 /* We are called with head length >= MAX_GRO_SKBS, so this is
5905 if (WARN_ON_ONCE(!oldest
))
5908 /* Do not adjust napi->gro_hash[].count, caller is adding a new
5911 skb_list_del_init(oldest
);
5912 napi_gro_complete(napi
, oldest
);
5915 INDIRECT_CALLABLE_DECLARE(struct sk_buff
*inet_gro_receive(struct list_head
*,
5917 INDIRECT_CALLABLE_DECLARE(struct sk_buff
*ipv6_gro_receive(struct list_head
*,
5919 static enum gro_result
dev_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
5921 u32 hash
= skb_get_hash_raw(skb
) & (GRO_HASH_BUCKETS
- 1);
5922 struct list_head
*head
= &offload_base
;
5923 struct packet_offload
*ptype
;
5924 __be16 type
= skb
->protocol
;
5925 struct list_head
*gro_head
;
5926 struct sk_buff
*pp
= NULL
;
5927 enum gro_result ret
;
5931 if (netif_elide_gro(skb
->dev
))
5934 gro_head
= gro_list_prepare(napi
, skb
);
5937 list_for_each_entry_rcu(ptype
, head
, list
) {
5938 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_receive
)
5941 skb_set_network_header(skb
, skb_gro_offset(skb
));
5942 skb_reset_mac_len(skb
);
5943 NAPI_GRO_CB(skb
)->same_flow
= 0;
5944 NAPI_GRO_CB(skb
)->flush
= skb_is_gso(skb
) || skb_has_frag_list(skb
);
5945 NAPI_GRO_CB(skb
)->free
= 0;
5946 NAPI_GRO_CB(skb
)->encap_mark
= 0;
5947 NAPI_GRO_CB(skb
)->recursion_counter
= 0;
5948 NAPI_GRO_CB(skb
)->is_fou
= 0;
5949 NAPI_GRO_CB(skb
)->is_atomic
= 1;
5950 NAPI_GRO_CB(skb
)->gro_remcsum_start
= 0;
5952 /* Setup for GRO checksum validation */
5953 switch (skb
->ip_summed
) {
5954 case CHECKSUM_COMPLETE
:
5955 NAPI_GRO_CB(skb
)->csum
= skb
->csum
;
5956 NAPI_GRO_CB(skb
)->csum_valid
= 1;
5957 NAPI_GRO_CB(skb
)->csum_cnt
= 0;
5959 case CHECKSUM_UNNECESSARY
:
5960 NAPI_GRO_CB(skb
)->csum_cnt
= skb
->csum_level
+ 1;
5961 NAPI_GRO_CB(skb
)->csum_valid
= 0;
5964 NAPI_GRO_CB(skb
)->csum_cnt
= 0;
5965 NAPI_GRO_CB(skb
)->csum_valid
= 0;
5968 pp
= INDIRECT_CALL_INET(ptype
->callbacks
.gro_receive
,
5969 ipv6_gro_receive
, inet_gro_receive
,
5975 if (&ptype
->list
== head
)
5978 if (PTR_ERR(pp
) == -EINPROGRESS
) {
5983 same_flow
= NAPI_GRO_CB(skb
)->same_flow
;
5984 ret
= NAPI_GRO_CB(skb
)->free
? GRO_MERGED_FREE
: GRO_MERGED
;
5987 skb_list_del_init(pp
);
5988 napi_gro_complete(napi
, pp
);
5989 napi
->gro_hash
[hash
].count
--;
5995 if (NAPI_GRO_CB(skb
)->flush
)
5998 if (unlikely(napi
->gro_hash
[hash
].count
>= MAX_GRO_SKBS
)) {
5999 gro_flush_oldest(napi
, gro_head
);
6001 napi
->gro_hash
[hash
].count
++;
6003 NAPI_GRO_CB(skb
)->count
= 1;
6004 NAPI_GRO_CB(skb
)->age
= jiffies
;
6005 NAPI_GRO_CB(skb
)->last
= skb
;
6006 skb_shinfo(skb
)->gso_size
= skb_gro_len(skb
);
6007 list_add(&skb
->list
, gro_head
);
6011 grow
= skb_gro_offset(skb
) - skb_headlen(skb
);
6013 gro_pull_from_frag0(skb
, grow
);
6015 if (napi
->gro_hash
[hash
].count
) {
6016 if (!test_bit(hash
, &napi
->gro_bitmask
))
6017 __set_bit(hash
, &napi
->gro_bitmask
);
6018 } else if (test_bit(hash
, &napi
->gro_bitmask
)) {
6019 __clear_bit(hash
, &napi
->gro_bitmask
);
6029 struct packet_offload
*gro_find_receive_by_type(__be16 type
)
6031 struct list_head
*offload_head
= &offload_base
;
6032 struct packet_offload
*ptype
;
6034 list_for_each_entry_rcu(ptype
, offload_head
, list
) {
6035 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_receive
)
6041 EXPORT_SYMBOL(gro_find_receive_by_type
);
6043 struct packet_offload
*gro_find_complete_by_type(__be16 type
)
6045 struct list_head
*offload_head
= &offload_base
;
6046 struct packet_offload
*ptype
;
6048 list_for_each_entry_rcu(ptype
, offload_head
, list
) {
6049 if (ptype
->type
!= type
|| !ptype
->callbacks
.gro_complete
)
6055 EXPORT_SYMBOL(gro_find_complete_by_type
);
6057 static void napi_skb_free_stolen_head(struct sk_buff
*skb
)
6061 kmem_cache_free(skbuff_head_cache
, skb
);
6064 static gro_result_t
napi_skb_finish(struct napi_struct
*napi
,
6065 struct sk_buff
*skb
,
6070 gro_normal_one(napi
, skb
);
6077 case GRO_MERGED_FREE
:
6078 if (NAPI_GRO_CB(skb
)->free
== NAPI_GRO_FREE_STOLEN_HEAD
)
6079 napi_skb_free_stolen_head(skb
);
6093 gro_result_t
napi_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
6097 skb_mark_napi_id(skb
, napi
);
6098 trace_napi_gro_receive_entry(skb
);
6100 skb_gro_reset_offset(skb
);
6102 ret
= napi_skb_finish(napi
, skb
, dev_gro_receive(napi
, skb
));
6103 trace_napi_gro_receive_exit(ret
);
6107 EXPORT_SYMBOL(napi_gro_receive
);
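/*
 * Illustrative sketch (not part of the original file): the canonical driver
 * NAPI poll loop feeding napi_gro_receive() and finishing with
 * napi_complete_done(). "my_adapter", "my_fetch_rx_skb" and
 * "my_enable_rx_irq" are hypothetical stand-ins for a real RX ring; the
 * budget handling follows the rules enforced by napi_poll() later in this
 * file.
 */
struct my_adapter {
	struct napi_struct napi;
};

static struct sk_buff *my_fetch_rx_skb(struct my_adapter *ad)
{
	return NULL;	/* stub: a real driver pulls a completed skb off its ring */
}

static void my_enable_rx_irq(struct my_adapter *ad)
{
	/* stub: a real driver re-arms its RX interrupt here */
}

static __maybe_unused int my_napi_poll(struct napi_struct *napi, int budget)
{
	struct my_adapter *ad = container_of(napi, struct my_adapter, napi);
	int work_done = 0;

	while (work_done < budget) {
		struct sk_buff *skb = my_fetch_rx_skb(ad);

		if (!skb)
			break;
		napi_gro_receive(napi, skb);
		work_done++;
	}

	/* Only re-enable device interrupts if the budget was not exhausted. */
	if (work_done < budget && napi_complete_done(napi, work_done))
		my_enable_rx_irq(ad);

	return work_done;
}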
6109 static void napi_reuse_skb(struct napi_struct
*napi
, struct sk_buff
*skb
)
6111 if (unlikely(skb
->pfmemalloc
)) {
6115 __skb_pull(skb
, skb_headlen(skb
));
6116 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
6117 skb_reserve(skb
, NET_SKB_PAD
+ NET_IP_ALIGN
- skb_headroom(skb
));
6118 __vlan_hwaccel_clear_tag(skb
);
6119 skb
->dev
= napi
->dev
;
6122 /* eth_type_trans() assumes pkt_type is PACKET_HOST */
6123 skb
->pkt_type
= PACKET_HOST
;
6125 skb
->encapsulation
= 0;
6126 skb_shinfo(skb
)->gso_type
= 0;
6127 skb
->truesize
= SKB_TRUESIZE(skb_end_offset(skb
));
6133 struct sk_buff
*napi_get_frags(struct napi_struct
*napi
)
6135 struct sk_buff
*skb
= napi
->skb
;
6138 skb
= napi_alloc_skb(napi
, GRO_MAX_HEAD
);
6141 skb_mark_napi_id(skb
, napi
);
6146 EXPORT_SYMBOL(napi_get_frags
);
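/*
 * Illustrative sketch (not part of the original file): the napi_get_frags()
 * / napi_gro_frags() pattern used by drivers that receive into pages and do
 * not build a linear header themselves. "my_rx_page" and its parameters are
 * hypothetical; napi_frags_skb() later in this file pulls the Ethernet
 * header out of frag0 before GRO sees the packet.
 */
static __maybe_unused void my_rx_page(struct napi_struct *napi,
				      struct page *page, unsigned int offset,
				      unsigned int len, unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);
		return;
	}

	/* Attach the page as frag 0; the headers stay in the page. */
	skb_add_rx_frag(skb, 0, page, offset, len, truesize);

	napi_gro_frags(napi);
}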
6148 static gro_result_t
napi_frags_finish(struct napi_struct
*napi
,
6149 struct sk_buff
*skb
,
6155 __skb_push(skb
, ETH_HLEN
);
6156 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
6157 if (ret
== GRO_NORMAL
)
6158 gro_normal_one(napi
, skb
);
6162 napi_reuse_skb(napi
, skb
);
6165 case GRO_MERGED_FREE
:
6166 if (NAPI_GRO_CB(skb
)->free
== NAPI_GRO_FREE_STOLEN_HEAD
)
6167 napi_skb_free_stolen_head(skb
);
6169 napi_reuse_skb(napi
, skb
);
6180 /* Upper GRO stack assumes network header starts at gro_offset=0
6181 * Drivers could call both napi_gro_frags() and napi_gro_receive()
6182 * We copy ethernet header into skb->data to have a common layout.
6184 static struct sk_buff
*napi_frags_skb(struct napi_struct
*napi
)
6186 struct sk_buff
*skb
= napi
->skb
;
6187 const struct ethhdr
*eth
;
6188 unsigned int hlen
= sizeof(*eth
);
6192 skb_reset_mac_header(skb
);
6193 skb_gro_reset_offset(skb
);
6195 if (unlikely(skb_gro_header_hard(skb
, hlen
))) {
6196 eth
= skb_gro_header_slow(skb
, hlen
, 0);
6197 if (unlikely(!eth
)) {
6198 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6199 __func__
, napi
->dev
->name
);
6200 napi_reuse_skb(napi
, skb
);
6204 eth
= (const struct ethhdr
*)skb
->data
;
6205 gro_pull_from_frag0(skb
, hlen
);
6206 NAPI_GRO_CB(skb
)->frag0
+= hlen
;
6207 NAPI_GRO_CB(skb
)->frag0_len
-= hlen
;
6209 __skb_pull(skb
, hlen
);
6212 * This works because the only protocols we care about don't require
6214 * We'll fix it up properly in napi_frags_finish()
6216 skb
->protocol
= eth
->h_proto
;
6221 gro_result_t
napi_gro_frags(struct napi_struct
*napi
)
6224 struct sk_buff
*skb
= napi_frags_skb(napi
);
6229 trace_napi_gro_frags_entry(skb
);
6231 ret
= napi_frags_finish(napi
, skb
, dev_gro_receive(napi
, skb
));
6232 trace_napi_gro_frags_exit(ret
);
6236 EXPORT_SYMBOL(napi_gro_frags
);
6238 /* Compute the checksum from gro_offset and return the folded value
6239 * after adding in any pseudo checksum.
6241 __sum16
__skb_gro_checksum_complete(struct sk_buff
*skb
)
6246 wsum
= skb_checksum(skb
, skb_gro_offset(skb
), skb_gro_len(skb
), 0);
6248 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6249 sum
= csum_fold(csum_add(NAPI_GRO_CB(skb
)->csum
, wsum
));
6250 /* See comments in __skb_checksum_complete(). */
6252 if (unlikely(skb
->ip_summed
== CHECKSUM_COMPLETE
) &&
6253 !skb
->csum_complete_sw
)
6254 netdev_rx_csum_fault(skb
->dev
, skb
);
6257 NAPI_GRO_CB(skb
)->csum
= wsum
;
6258 NAPI_GRO_CB(skb
)->csum_valid
= 1;
6262 EXPORT_SYMBOL(__skb_gro_checksum_complete
);
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	while (remsd) {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
		remsd = next;
	}
#endif
}

/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		net_rps_send_ipi(remsd);
	} else
#endif
		local_irq_enable();
}

static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}
6307 static int process_backlog(struct napi_struct
*napi
, int quota
)
6309 struct softnet_data
*sd
= container_of(napi
, struct softnet_data
, backlog
);
6313 /* Check if we have pending ipi, its better to send them now,
6314 * not waiting net_rx_action() end.
6316 if (sd_has_rps_ipi_waiting(sd
)) {
6317 local_irq_disable();
6318 net_rps_action_and_irq_enable(sd
);
6321 napi
->weight
= dev_rx_weight
;
6323 struct sk_buff
*skb
;
6325 while ((skb
= __skb_dequeue(&sd
->process_queue
))) {
6327 __netif_receive_skb(skb
);
6329 input_queue_head_incr(sd
);
6330 if (++work
>= quota
)
6335 local_irq_disable();
6337 if (skb_queue_empty(&sd
->input_pkt_queue
)) {
6339 * Inline a custom version of __napi_complete().
6340 * only current cpu owns and manipulates this napi,
6341 * and NAPI_STATE_SCHED is the only possible flag set
6343 * We can use a plain write instead of clear_bit(),
6344 * and we dont need an smp_mb() memory barrier.
6349 skb_queue_splice_tail_init(&sd
->input_pkt_queue
,
6350 &sd
->process_queue
);
6360 * __napi_schedule - schedule for receive
6361 * @n: entry to schedule
6363 * The entry's receive function will be scheduled to run.
6364 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6366 void __napi_schedule(struct napi_struct
*n
)
6368 unsigned long flags
;
6370 local_irq_save(flags
);
6371 ____napi_schedule(this_cpu_ptr(&softnet_data
), n
);
6372 local_irq_restore(flags
);
6374 EXPORT_SYMBOL(__napi_schedule
);
6377 * napi_schedule_prep - check if napi can be scheduled
6380 * Test if NAPI routine is already running, and if not mark
6381 * it as running. This is used as a condition variable to
6382 * insure only one NAPI poll instance runs. We also make
6383 * sure there is no pending NAPI disable.
6385 bool napi_schedule_prep(struct napi_struct
*n
)
6387 unsigned long val
, new;
6390 val
= READ_ONCE(n
->state
);
6391 if (unlikely(val
& NAPIF_STATE_DISABLE
))
6393 new = val
| NAPIF_STATE_SCHED
;
6395 /* Sets STATE_MISSED bit if STATE_SCHED was already set
6396 * This was suggested by Alexander Duyck, as compiler
6397 * emits better code than :
6398 * if (val & NAPIF_STATE_SCHED)
6399 * new |= NAPIF_STATE_MISSED;
6401 new |= (val
& NAPIF_STATE_SCHED
) / NAPIF_STATE_SCHED
*
6403 } while (cmpxchg(&n
->state
, val
, new) != val
);
6405 return !(val
& NAPIF_STATE_SCHED
);
6407 EXPORT_SYMBOL(napi_schedule_prep
);
6410 * __napi_schedule_irqoff - schedule for receive
6411 * @n: entry to schedule
6413 * Variant of __napi_schedule() assuming hard irqs are masked
6415 void __napi_schedule_irqoff(struct napi_struct
*n
)
6417 ____napi_schedule(this_cpu_ptr(&softnet_data
), n
);
6419 EXPORT_SYMBOL(__napi_schedule_irqoff
);
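/*
 * Illustrative sketch (not part of the original file): the interrupt-handler
 * side of the scheduling helpers above. "my_adapter2" and
 * "my_disable_rx_irq" are hypothetical; napi_schedule_prep() +
 * __napi_schedule() is the open-coded form of napi_schedule() that lets the
 * driver mask its own interrupt only when it actually won the right to poll.
 */
struct my_adapter2 {
	struct napi_struct napi;
};

static void my_disable_rx_irq(struct my_adapter2 *ad)
{
	/* stub: a real driver masks its RX interrupt source here */
}

static __maybe_unused irqreturn_t my_rx_interrupt(int irq, void *data)
{
	struct my_adapter2 *ad = data;

	if (napi_schedule_prep(&ad->napi)) {
		my_disable_rx_irq(ad);
		__napi_schedule(&ad->napi);
	}
	return IRQ_HANDLED;
}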
6421 bool napi_complete_done(struct napi_struct
*n
, int work_done
)
6423 unsigned long flags
, val
, new, timeout
= 0;
6427 * 1) Don't let napi dequeue from the cpu poll list
6428 * just in case its running on a different cpu.
6429 * 2) If we are busy polling, do nothing here, we have
6430 * the guarantee we will be called later.
6432 if (unlikely(n
->state
& (NAPIF_STATE_NPSVC
|
6433 NAPIF_STATE_IN_BUSY_POLL
)))
6438 timeout
= READ_ONCE(n
->dev
->gro_flush_timeout
);
6439 n
->defer_hard_irqs_count
= READ_ONCE(n
->dev
->napi_defer_hard_irqs
);
6441 if (n
->defer_hard_irqs_count
> 0) {
6442 n
->defer_hard_irqs_count
--;
6443 timeout
= READ_ONCE(n
->dev
->gro_flush_timeout
);
6447 if (n
->gro_bitmask
) {
6448 /* When the NAPI instance uses a timeout and keeps postponing
6449 * it, we need to bound somehow the time packets are kept in
6452 napi_gro_flush(n
, !!timeout
);
6457 if (unlikely(!list_empty(&n
->poll_list
))) {
6458 /* If n->poll_list is not empty, we need to mask irqs */
6459 local_irq_save(flags
);
6460 list_del_init(&n
->poll_list
);
6461 local_irq_restore(flags
);
6465 val
= READ_ONCE(n
->state
);
6467 WARN_ON_ONCE(!(val
& NAPIF_STATE_SCHED
));
6469 new = val
& ~(NAPIF_STATE_MISSED
| NAPIF_STATE_SCHED
|
6470 NAPIF_STATE_PREFER_BUSY_POLL
);
6472 /* If STATE_MISSED was set, leave STATE_SCHED set,
6473 * because we will call napi->poll() one more time.
6474 * This C code was suggested by Alexander Duyck to help gcc.
6476 new |= (val
& NAPIF_STATE_MISSED
) / NAPIF_STATE_MISSED
*
6478 } while (cmpxchg(&n
->state
, val
, new) != val
);
6480 if (unlikely(val
& NAPIF_STATE_MISSED
)) {
6486 hrtimer_start(&n
->timer
, ns_to_ktime(timeout
),
6487 HRTIMER_MODE_REL_PINNED
);
6490 EXPORT_SYMBOL(napi_complete_done
);
6492 /* must be called under rcu_read_lock(), as we dont take a reference */
6493 static struct napi_struct
*napi_by_id(unsigned int napi_id
)
6495 unsigned int hash
= napi_id
% HASH_SIZE(napi_hash
);
6496 struct napi_struct
*napi
;
6498 hlist_for_each_entry_rcu(napi
, &napi_hash
[hash
], napi_hash_node
)
6499 if (napi
->napi_id
== napi_id
)
6505 #if defined(CONFIG_NET_RX_BUSY_POLL)
6507 static void __busy_poll_stop(struct napi_struct
*napi
, bool skip_schedule
)
6509 if (!skip_schedule
) {
6510 gro_normal_list(napi
);
6511 __napi_schedule(napi
);
6515 if (napi
->gro_bitmask
) {
6516 /* flush too old packets
6517 * If HZ < 1000, flush all packets.
6519 napi_gro_flush(napi
, HZ
>= 1000);
6522 gro_normal_list(napi
);
6523 clear_bit(NAPI_STATE_SCHED
, &napi
->state
);
6526 static void busy_poll_stop(struct napi_struct
*napi
, void *have_poll_lock
, bool prefer_busy_poll
,
6529 bool skip_schedule
= false;
6530 unsigned long timeout
;
6533 /* Busy polling means there is a high chance device driver hard irq
6534 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6535 * set in napi_schedule_prep().
6536 * Since we are about to call napi->poll() once more, we can safely
6537 * clear NAPI_STATE_MISSED.
6539 * Note: x86 could use a single "lock and ..." instruction
6540 * to perform these two clear_bit()
6542 clear_bit(NAPI_STATE_MISSED
, &napi
->state
);
6543 clear_bit(NAPI_STATE_IN_BUSY_POLL
, &napi
->state
);
6547 if (prefer_busy_poll
) {
6548 napi
->defer_hard_irqs_count
= READ_ONCE(napi
->dev
->napi_defer_hard_irqs
);
6549 timeout
= READ_ONCE(napi
->dev
->gro_flush_timeout
);
6550 if (napi
->defer_hard_irqs_count
&& timeout
) {
6551 hrtimer_start(&napi
->timer
, ns_to_ktime(timeout
), HRTIMER_MODE_REL_PINNED
);
6552 skip_schedule
= true;
6556 /* All we really want here is to re-enable device interrupts.
6557 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6559 rc
= napi
->poll(napi
, budget
);
6560 /* We can't gro_normal_list() here, because napi->poll() might have
6561 * rearmed the napi (napi_complete_done()) in which case it could
6562 * already be running on another CPU.
6564 trace_napi_poll(napi
, rc
, budget
);
6565 netpoll_poll_unlock(have_poll_lock
);
6567 __busy_poll_stop(napi
, skip_schedule
);
6571 void napi_busy_loop(unsigned int napi_id
,
6572 bool (*loop_end
)(void *, unsigned long),
6573 void *loop_end_arg
, bool prefer_busy_poll
, u16 budget
)
6575 unsigned long start_time
= loop_end
? busy_loop_current_time() : 0;
6576 int (*napi_poll
)(struct napi_struct
*napi
, int budget
);
6577 void *have_poll_lock
= NULL
;
6578 struct napi_struct
*napi
;
6585 napi
= napi_by_id(napi_id
);
6595 unsigned long val
= READ_ONCE(napi
->state
);
6597 /* If multiple threads are competing for this napi,
6598 * we avoid dirtying napi->state as much as we can.
6600 if (val
& (NAPIF_STATE_DISABLE
| NAPIF_STATE_SCHED
|
6601 NAPIF_STATE_IN_BUSY_POLL
)) {
6602 if (prefer_busy_poll
)
6603 set_bit(NAPI_STATE_PREFER_BUSY_POLL
, &napi
->state
);
6606 if (cmpxchg(&napi
->state
, val
,
6607 val
| NAPIF_STATE_IN_BUSY_POLL
|
6608 NAPIF_STATE_SCHED
) != val
) {
6609 if (prefer_busy_poll
)
6610 set_bit(NAPI_STATE_PREFER_BUSY_POLL
, &napi
->state
);
6613 have_poll_lock
= netpoll_poll_lock(napi
);
6614 napi_poll
= napi
->poll
;
6616 work
= napi_poll(napi
, budget
);
6617 trace_napi_poll(napi
, work
, budget
);
6618 gro_normal_list(napi
);
6621 __NET_ADD_STATS(dev_net(napi
->dev
),
6622 LINUX_MIB_BUSYPOLLRXPACKETS
, work
);
6625 if (!loop_end
|| loop_end(loop_end_arg
, start_time
))
6628 if (unlikely(need_resched())) {
6630 busy_poll_stop(napi
, have_poll_lock
, prefer_busy_poll
, budget
);
6634 if (loop_end(loop_end_arg
, start_time
))
6641 busy_poll_stop(napi
, have_poll_lock
, prefer_busy_poll
, budget
);
6646 EXPORT_SYMBOL(napi_busy_loop
);
6648 #endif /* CONFIG_NET_RX_BUSY_POLL */
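/*
 * Illustrative sketch (not part of the original file, assumes
 * CONFIG_NET_RX_BUSY_POLL): what a napi_busy_loop() caller looks like,
 * loosely modelled on the socket-layer user of this API. "my_loop_end",
 * "my_busy_poll" and the 50us/16-packet values are hypothetical; returning
 * true from the loop_end callback is what terminates the busy poll.
 */
static bool my_loop_end(void *arg, unsigned long start_time)
{
	unsigned long *deadline = arg;

	return time_after(jiffies, *deadline);
}

static __maybe_unused void my_busy_poll(unsigned int napi_id)
{
	unsigned long deadline = jiffies + usecs_to_jiffies(50);

	napi_busy_loop(napi_id, my_loop_end, &deadline,
		       false /* prefer_busy_poll */, 16 /* budget */);
}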
6650 static void napi_hash_add(struct napi_struct
*napi
)
6652 if (test_bit(NAPI_STATE_NO_BUSY_POLL
, &napi
->state
))
6655 spin_lock(&napi_hash_lock
);
6657 /* 0..NR_CPUS range is reserved for sender_cpu use */
6659 if (unlikely(++napi_gen_id
< MIN_NAPI_ID
))
6660 napi_gen_id
= MIN_NAPI_ID
;
6661 } while (napi_by_id(napi_gen_id
));
6662 napi
->napi_id
= napi_gen_id
;
6664 hlist_add_head_rcu(&napi
->napi_hash_node
,
6665 &napi_hash
[napi
->napi_id
% HASH_SIZE(napi_hash
)]);
6667 spin_unlock(&napi_hash_lock
);
6670 /* Warning : caller is responsible to make sure rcu grace period
6671 * is respected before freeing memory containing @napi
6673 static void napi_hash_del(struct napi_struct
*napi
)
6675 spin_lock(&napi_hash_lock
);
6677 hlist_del_init_rcu(&napi
->napi_hash_node
);
6679 spin_unlock(&napi_hash_lock
);
6682 static enum hrtimer_restart
napi_watchdog(struct hrtimer
*timer
)
6684 struct napi_struct
*napi
;
6686 napi
= container_of(timer
, struct napi_struct
, timer
);
6688 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6689 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6691 if (!napi_disable_pending(napi
) &&
6692 !test_and_set_bit(NAPI_STATE_SCHED
, &napi
->state
)) {
6693 clear_bit(NAPI_STATE_PREFER_BUSY_POLL
, &napi
->state
);
6694 __napi_schedule_irqoff(napi
);
6697 return HRTIMER_NORESTART
;
6700 static void init_gro_hash(struct napi_struct
*napi
)
6704 for (i
= 0; i
< GRO_HASH_BUCKETS
; i
++) {
6705 INIT_LIST_HEAD(&napi
->gro_hash
[i
].list
);
6706 napi
->gro_hash
[i
].count
= 0;
6708 napi
->gro_bitmask
= 0;
6711 void netif_napi_add(struct net_device
*dev
, struct napi_struct
*napi
,
6712 int (*poll
)(struct napi_struct
*, int), int weight
)
6714 if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED
, &napi
->state
)))
6717 INIT_LIST_HEAD(&napi
->poll_list
);
6718 INIT_HLIST_NODE(&napi
->napi_hash_node
);
6719 hrtimer_init(&napi
->timer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL_PINNED
);
6720 napi
->timer
.function
= napi_watchdog
;
6721 init_gro_hash(napi
);
6723 INIT_LIST_HEAD(&napi
->rx_list
);
6726 if (weight
> NAPI_POLL_WEIGHT
)
6727 netdev_err_once(dev
, "%s() called with weight %d\n", __func__
,
6729 napi
->weight
= weight
;
6731 #ifdef CONFIG_NETPOLL
6732 napi
->poll_owner
= -1;
6734 set_bit(NAPI_STATE_SCHED
, &napi
->state
);
6735 set_bit(NAPI_STATE_NPSVC
, &napi
->state
);
6736 list_add_rcu(&napi
->dev_list
, &dev
->napi_list
);
6737 napi_hash_add(napi
);
6739 EXPORT_SYMBOL(netif_napi_add
);
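/*
 * Illustrative sketch (not part of the original file): the driver life-cycle
 * around netif_napi_add(). "my_adapter3", "my_napi_poll3", "my_open_napi"
 * and "my_close_napi" are hypothetical; real drivers often split the
 * add/del calls into probe/remove instead, but the ordering relative to
 * napi_enable()/napi_disable() stays the same.
 */
struct my_adapter3 {
	struct net_device *netdev;
	struct napi_struct napi;
};

static int my_napi_poll3(struct napi_struct *napi, int budget)
{
	return 0;	/* stub: see the poll-loop sketch earlier in this file */
}

static __maybe_unused void my_open_napi(struct my_adapter3 *ad)
{
	netif_napi_add(ad->netdev, &ad->napi, my_napi_poll3, NAPI_POLL_WEIGHT);
	napi_enable(&ad->napi);
}

static __maybe_unused void my_close_napi(struct my_adapter3 *ad)
{
	napi_disable(&ad->napi);
	netif_napi_del(&ad->napi);
}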
6741 void napi_disable(struct napi_struct
*n
)
6744 set_bit(NAPI_STATE_DISABLE
, &n
->state
);
6746 while (test_and_set_bit(NAPI_STATE_SCHED
, &n
->state
))
6748 while (test_and_set_bit(NAPI_STATE_NPSVC
, &n
->state
))
6751 hrtimer_cancel(&n
->timer
);
6753 clear_bit(NAPI_STATE_PREFER_BUSY_POLL
, &n
->state
);
6754 clear_bit(NAPI_STATE_DISABLE
, &n
->state
);
6756 EXPORT_SYMBOL(napi_disable
);
6758 static void flush_gro_hash(struct napi_struct
*napi
)
6762 for (i
= 0; i
< GRO_HASH_BUCKETS
; i
++) {
6763 struct sk_buff
*skb
, *n
;
6765 list_for_each_entry_safe(skb
, n
, &napi
->gro_hash
[i
].list
, list
)
6767 napi
->gro_hash
[i
].count
= 0;
6771 /* Must be called in process context */
6772 void __netif_napi_del(struct napi_struct
*napi
)
6774 if (!test_and_clear_bit(NAPI_STATE_LISTED
, &napi
->state
))
6777 napi_hash_del(napi
);
6778 list_del_rcu(&napi
->dev_list
);
6779 napi_free_frags(napi
);
6781 flush_gro_hash(napi
);
6782 napi
->gro_bitmask
= 0;
6784 EXPORT_SYMBOL(__netif_napi_del
);
6786 static int napi_poll(struct napi_struct
*n
, struct list_head
*repoll
)
6791 list_del_init(&n
->poll_list
);
6793 have
= netpoll_poll_lock(n
);
6797 /* This NAPI_STATE_SCHED test is for avoiding a race
6798 * with netpoll's poll_napi(). Only the entity which
6799 * obtains the lock and sees NAPI_STATE_SCHED set will
6800 * actually make the ->poll() call. Therefore we avoid
6801 * accidentally calling ->poll() when NAPI is not scheduled.
6804 if (test_bit(NAPI_STATE_SCHED
, &n
->state
)) {
6805 work
= n
->poll(n
, weight
);
6806 trace_napi_poll(n
, work
, weight
);
6809 if (unlikely(work
> weight
))
6810 pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6811 n
->poll
, work
, weight
);
6813 if (likely(work
< weight
))
6816 /* Drivers must not modify the NAPI state if they
6817 * consume the entire weight. In such cases this code
6818 * still "owns" the NAPI instance and therefore can
6819 * move the instance around on the list at-will.
6821 if (unlikely(napi_disable_pending(n
))) {
6826 /* The NAPI context has more processing work, but busy-polling
6827 * is preferred. Exit early.
6829 if (napi_prefer_busy_poll(n
)) {
6830 if (napi_complete_done(n
, work
)) {
6831 /* If timeout is not set, we need to make sure
6832 * that the NAPI is re-scheduled.
6839 if (n
->gro_bitmask
) {
6840 /* flush too old packets
6841 * If HZ < 1000, flush all packets.
6843 napi_gro_flush(n
, HZ
>= 1000);
6848 /* Some drivers may have called napi_schedule
6849 * prior to exhausting their budget.
6851 if (unlikely(!list_empty(&n
->poll_list
))) {
6852 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6853 n
->dev
? n
->dev
->name
: "backlog");
6857 list_add_tail(&n
->poll_list
, repoll
);
6860 netpoll_poll_unlock(have
);
6865 static __latent_entropy
void net_rx_action(struct softirq_action
*h
)
6867 struct softnet_data
*sd
= this_cpu_ptr(&softnet_data
);
6868 unsigned long time_limit
= jiffies
+
6869 usecs_to_jiffies(netdev_budget_usecs
);
6870 int budget
= netdev_budget
;
6874 local_irq_disable();
6875 list_splice_init(&sd
->poll_list
, &list
);
6879 struct napi_struct
*n
;
6881 if (list_empty(&list
)) {
6882 if (!sd_has_rps_ipi_waiting(sd
) && list_empty(&repoll
))
6887 n
= list_first_entry(&list
, struct napi_struct
, poll_list
);
6888 budget
-= napi_poll(n
, &repoll
);
6890 /* If softirq window is exhausted then punt.
6891 * Allow this to run for 2 jiffies since which will allow
6892 * an average latency of 1.5/HZ.
6894 if (unlikely(budget
<= 0 ||
6895 time_after_eq(jiffies
, time_limit
))) {
6901 local_irq_disable();
6903 list_splice_tail_init(&sd
->poll_list
, &list
);
6904 list_splice_tail(&repoll
, &list
);
6905 list_splice(&list
, &sd
->poll_list
);
6906 if (!list_empty(&sd
->poll_list
))
6907 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
6909 net_rps_action_and_irq_enable(sd
);
6911 __kfree_skb_flush();
6914 struct netdev_adjacent
{
6915 struct net_device
*dev
;
6917 /* upper master flag, there can only be one master device per list */
6920 /* lookup ignore flag */
6923 /* counter for the number of times this device was added to us */
6926 /* private field for the users */
6929 struct list_head list
;
6930 struct rcu_head rcu
;
6933 static struct netdev_adjacent
*__netdev_find_adj(struct net_device
*adj_dev
,
6934 struct list_head
*adj_list
)
6936 struct netdev_adjacent
*adj
;
6938 list_for_each_entry(adj
, adj_list
, list
) {
6939 if (adj
->dev
== adj_dev
)
6945 static int ____netdev_has_upper_dev(struct net_device
*upper_dev
,
6946 struct netdev_nested_priv
*priv
)
6948 struct net_device
*dev
= (struct net_device
*)priv
->data
;
6950 return upper_dev
== dev
;
6954 * netdev_has_upper_dev - Check if device is linked to an upper device
6956 * @upper_dev: upper device to check
6958 * Find out if a device is linked to specified upper device and return true
6959 * in case it is. Note that this checks only immediate upper device,
6960 * not through a complete stack of devices. The caller must hold the RTNL lock.
6962 bool netdev_has_upper_dev(struct net_device
*dev
,
6963 struct net_device
*upper_dev
)
6965 struct netdev_nested_priv priv
= {
6966 .data
= (void *)upper_dev
,
6971 return netdev_walk_all_upper_dev_rcu(dev
, ____netdev_has_upper_dev
,
6974 EXPORT_SYMBOL(netdev_has_upper_dev
);
6977 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
6979 * @upper_dev: upper device to check
6981 * Find out if a device is linked to specified upper device and return true
6982 * in case it is. Note that this checks the entire upper device chain.
6983 * The caller must hold rcu lock.
6986 bool netdev_has_upper_dev_all_rcu(struct net_device
*dev
,
6987 struct net_device
*upper_dev
)
6989 struct netdev_nested_priv priv
= {
6990 .data
= (void *)upper_dev
,
6993 return !!netdev_walk_all_upper_dev_rcu(dev
, ____netdev_has_upper_dev
,
6996 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu
);
6999 * netdev_has_any_upper_dev - Check if device is linked to some device
7002 * Find out if a device is linked to an upper device and return true in case
7003 * it is. The caller must hold the RTNL lock.
7005 bool netdev_has_any_upper_dev(struct net_device
*dev
)
7009 return !list_empty(&dev
->adj_list
.upper
);
7011 EXPORT_SYMBOL(netdev_has_any_upper_dev
);
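/*
 * Illustrative sketch (not part of the original file): how a setup path
 * might use the adjacency helpers above to refuse stacking a device on top
 * of one of its own uppers. "my_can_enslave" is hypothetical; the RTNL
 * requirement is the one documented for netdev_has_upper_dev(), and
 * netdev_master_upper_dev_get() is defined later in this file.
 */
static __maybe_unused bool my_can_enslave(struct net_device *master,
					  struct net_device *slave)
{
	ASSERT_RTNL();

	/* Refuse loops: the would-be slave must not already be above us. */
	if (slave == master || netdev_has_upper_dev(master, slave))
		return false;

	/* And it must not already be enslaved to another master. */
	return !netdev_master_upper_dev_get(slave);
}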
7014 * netdev_master_upper_dev_get - Get master upper device
7017 * Find a master upper device and return pointer to it or NULL in case
7018 * it's not there. The caller must hold the RTNL lock.
7020 struct net_device
*netdev_master_upper_dev_get(struct net_device
*dev
)
7022 struct netdev_adjacent
*upper
;
7026 if (list_empty(&dev
->adj_list
.upper
))
7029 upper
= list_first_entry(&dev
->adj_list
.upper
,
7030 struct netdev_adjacent
, list
);
7031 if (likely(upper
->master
))
7035 EXPORT_SYMBOL(netdev_master_upper_dev_get
);
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	ASSERT_RTNL();

	if (list_empty(&dev->adj_list.upper))
		return NULL;

	upper = list_first_entry(&dev->adj_list.upper,
				 struct netdev_adjacent, list);
	if (likely(upper->master) && !upper->ignore)
		return upper->dev;
	return NULL;
}

/**
 * netdev_has_any_lower_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to a lower device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->adj_list.lower);
}

void *netdev_adjacent_get_private(struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	adj = list_entry(adj_list, struct netdev_adjacent, list);

	return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);
/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						 struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
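/* Illustrative usage sketch (not part of the original file): callers
 * normally iterate uppers through the netdev_for_each_upper_dev_rcu()
 * helper, which wraps this function.
 *
 *	struct net_device *upper;
 *	struct list_head *iter;
 *
 *	rcu_read_lock();
 *	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 *		pr_info("upper of %s: %s\n", dev->name, upper->name);
 *	rcu_read_unlock();
 */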
7103 static struct net_device
*__netdev_next_upper_dev(struct net_device
*dev
,
7104 struct list_head
**iter
,
7107 struct netdev_adjacent
*upper
;
7109 upper
= list_entry((*iter
)->next
, struct netdev_adjacent
, list
);
7111 if (&upper
->list
== &dev
->adj_list
.upper
)
7114 *iter
= &upper
->list
;
7115 *ignore
= upper
->ignore
;
7120 static struct net_device
*netdev_next_upper_dev_rcu(struct net_device
*dev
,
7121 struct list_head
**iter
)
7123 struct netdev_adjacent
*upper
;
7125 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7127 upper
= list_entry_rcu((*iter
)->next
, struct netdev_adjacent
, list
);
7129 if (&upper
->list
== &dev
->adj_list
.upper
)
7132 *iter
= &upper
->list
;
7137 static int __netdev_walk_all_upper_dev(struct net_device
*dev
,
7138 int (*fn
)(struct net_device
*dev
,
7139 struct netdev_nested_priv
*priv
),
7140 struct netdev_nested_priv
*priv
)
7142 struct net_device
*udev
, *next
, *now
, *dev_stack
[MAX_NEST_DEV
+ 1];
7143 struct list_head
*niter
, *iter
, *iter_stack
[MAX_NEST_DEV
+ 1];
7148 iter
= &dev
->adj_list
.upper
;
7152 ret
= fn(now
, priv
);
7159 udev
= __netdev_next_upper_dev(now
, &iter
, &ignore
);
7166 niter
= &udev
->adj_list
.upper
;
7167 dev_stack
[cur
] = now
;
7168 iter_stack
[cur
++] = iter
;
7175 next
= dev_stack
[--cur
];
7176 niter
= iter_stack
[cur
];
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
				  int (*fn)(struct net_device *dev,
					    struct netdev_nested_priv *priv),
				  struct netdev_nested_priv *priv)
{
	struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
	struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
	int ret, cur = 0;

	now = dev;
	iter = &dev->adj_list.upper;

	while (1) {
		if (now != dev) {
			ret = fn(now, priv);
			if (ret)
				return ret;
		}

		next = NULL;
		while (1) {
			udev = netdev_next_upper_dev_rcu(now, &iter);
			if (!udev)
				break;

			next = udev;
			niter = &udev->adj_list.upper;
			dev_stack[cur] = now;
			iter_stack[cur++] = iter;
			break;
		}

		if (!next) {
			if (!cur)
				return 0;
			next = dev_stack[--cur];
			niter = iter_stack[cur];
		}

		now = next;
		iter = niter;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
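/* Illustrative usage sketch (not part of the original file): a walker
 * callback inspects each upper device and returns non-zero to stop the
 * traversal; "my_match" and the use of priv->data are hypothetical.
 *
 *	static int my_match(struct net_device *upper,
 *			    struct netdev_nested_priv *priv)
 *	{
 *		return upper == priv->data;
 *	}
 *
 *	rcu_read_lock();
 *	found = netdev_walk_all_upper_dev_rcu(dev, my_match, &priv);
 *	rcu_read_unlock();
 */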
7233 static bool __netdev_has_upper_dev(struct net_device
*dev
,
7234 struct net_device
*upper_dev
)
7236 struct netdev_nested_priv priv
= {
7238 .data
= (void *)upper_dev
,
7243 return __netdev_walk_all_upper_dev(dev
, ____netdev_has_upper_dev
,
7248 * netdev_lower_get_next_private - Get the next ->private from the
7249 * lower neighbour list
7251 * @iter: list_head ** of the current position
7253 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7254 * list, starting from iter position. The caller must hold either hold the
7255 * RTNL lock or its own locking that guarantees that the neighbour lower
7256 * list will remain unchanged.
7258 void *netdev_lower_get_next_private(struct net_device
*dev
,
7259 struct list_head
**iter
)
7261 struct netdev_adjacent
*lower
;
7263 lower
= list_entry(*iter
, struct netdev_adjacent
, list
);
7265 if (&lower
->list
== &dev
->adj_list
.lower
)
7268 *iter
= lower
->list
.next
;
7270 return lower
->private;
7272 EXPORT_SYMBOL(netdev_lower_get_next_private
);
7275 * netdev_lower_get_next_private_rcu - Get the next ->private from the
7276 * lower neighbour list, RCU
7279 * @iter: list_head ** of the current position
7281 * Gets the next netdev_adjacent->private from the dev's lower neighbour
7282 * list, starting from iter position. The caller must hold RCU read lock.
7284 void *netdev_lower_get_next_private_rcu(struct net_device
*dev
,
7285 struct list_head
**iter
)
7287 struct netdev_adjacent
*lower
;
7289 WARN_ON_ONCE(!rcu_read_lock_held());
7291 lower
= list_entry_rcu((*iter
)->next
, struct netdev_adjacent
, list
);
7293 if (&lower
->list
== &dev
->adj_list
.lower
)
7296 *iter
= &lower
->list
;
7298 return lower
->private;
7300 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu
);
7303 * netdev_lower_get_next - Get the next device from the lower neighbour
7306 * @iter: list_head ** of the current position
7308 * Gets the next netdev_adjacent from the dev's lower neighbour
7309 * list, starting from iter position. The caller must hold RTNL lock or
7310 * its own locking that guarantees that the neighbour lower
7311 * list will remain unchanged.
7313 void *netdev_lower_get_next(struct net_device
*dev
, struct list_head
**iter
)
7315 struct netdev_adjacent
*lower
;
7317 lower
= list_entry(*iter
, struct netdev_adjacent
, list
);
7319 if (&lower
->list
== &dev
->adj_list
.lower
)
7322 *iter
= lower
->list
.next
;
7326 EXPORT_SYMBOL(netdev_lower_get_next
);
7328 static struct net_device
*netdev_next_lower_dev(struct net_device
*dev
,
7329 struct list_head
**iter
)
7331 struct netdev_adjacent
*lower
;
7333 lower
= list_entry((*iter
)->next
, struct netdev_adjacent
, list
);
7335 if (&lower
->list
== &dev
->adj_list
.lower
)
7338 *iter
= &lower
->list
;
7343 static struct net_device
*__netdev_next_lower_dev(struct net_device
*dev
,
7344 struct list_head
**iter
,
7347 struct netdev_adjacent
*lower
;
7349 lower
= list_entry((*iter
)->next
, struct netdev_adjacent
, list
);
7351 if (&lower
->list
== &dev
->adj_list
.lower
)
7354 *iter
= &lower
->list
;
7355 *ignore
= lower
->ignore
;
7360 int netdev_walk_all_lower_dev(struct net_device
*dev
,
7361 int (*fn
)(struct net_device
*dev
,
7362 struct netdev_nested_priv
*priv
),
7363 struct netdev_nested_priv
*priv
)
7365 struct net_device
*ldev
, *next
, *now
, *dev_stack
[MAX_NEST_DEV
+ 1];
7366 struct list_head
*niter
, *iter
, *iter_stack
[MAX_NEST_DEV
+ 1];
7370 iter
= &dev
->adj_list
.lower
;
7374 ret
= fn(now
, priv
);
7381 ldev
= netdev_next_lower_dev(now
, &iter
);
7386 niter
= &ldev
->adj_list
.lower
;
7387 dev_stack
[cur
] = now
;
7388 iter_stack
[cur
++] = iter
;
7395 next
= dev_stack
[--cur
];
7396 niter
= iter_stack
[cur
];
7405 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev
);
7407 static int __netdev_walk_all_lower_dev(struct net_device
*dev
,
7408 int (*fn
)(struct net_device
*dev
,
7409 struct netdev_nested_priv
*priv
),
7410 struct netdev_nested_priv
*priv
)
7412 struct net_device
*ldev
, *next
, *now
, *dev_stack
[MAX_NEST_DEV
+ 1];
7413 struct list_head
*niter
, *iter
, *iter_stack
[MAX_NEST_DEV
+ 1];
7418 iter
= &dev
->adj_list
.lower
;
7422 ret
= fn(now
, priv
);
7429 ldev
= __netdev_next_lower_dev(now
, &iter
, &ignore
);
7436 niter
= &ldev
->adj_list
.lower
;
7437 dev_stack
[cur
] = now
;
7438 iter_stack
[cur
++] = iter
;
7445 next
= dev_stack
[--cur
];
7446 niter
= iter_stack
[cur
];
7456 struct net_device
*netdev_next_lower_dev_rcu(struct net_device
*dev
,
7457 struct list_head
**iter
)
7459 struct netdev_adjacent
*lower
;
7461 lower
= list_entry_rcu((*iter
)->next
, struct netdev_adjacent
, list
);
7462 if (&lower
->list
== &dev
->adj_list
.lower
)
7465 *iter
= &lower
->list
;
7469 EXPORT_SYMBOL(netdev_next_lower_dev_rcu
);
7471 static u8
__netdev_upper_depth(struct net_device
*dev
)
7473 struct net_device
*udev
;
7474 struct list_head
*iter
;
7478 for (iter
= &dev
->adj_list
.upper
,
7479 udev
= __netdev_next_upper_dev(dev
, &iter
, &ignore
);
7481 udev
= __netdev_next_upper_dev(dev
, &iter
, &ignore
)) {
7484 if (max_depth
< udev
->upper_level
)
7485 max_depth
= udev
->upper_level
;
7491 static u8
__netdev_lower_depth(struct net_device
*dev
)
7493 struct net_device
*ldev
;
7494 struct list_head
*iter
;
7498 for (iter
= &dev
->adj_list
.lower
,
7499 ldev
= __netdev_next_lower_dev(dev
, &iter
, &ignore
);
7501 ldev
= __netdev_next_lower_dev(dev
, &iter
, &ignore
)) {
7504 if (max_depth
< ldev
->lower_level
)
7505 max_depth
= ldev
->lower_level
;
7511 static int __netdev_update_upper_level(struct net_device
*dev
,
7512 struct netdev_nested_priv
*__unused
)
7514 dev
->upper_level
= __netdev_upper_depth(dev
) + 1;
7518 static int __netdev_update_lower_level(struct net_device
*dev
,
7519 struct netdev_nested_priv
*priv
)
7521 dev
->lower_level
= __netdev_lower_depth(dev
) + 1;
7523 #ifdef CONFIG_LOCKDEP
7527 if (priv
->flags
& NESTED_SYNC_IMM
)
7528 dev
->nested_level
= dev
->lower_level
- 1;
7529 if (priv
->flags
& NESTED_SYNC_TODO
)
7530 net_unlink_todo(dev
);
7535 int netdev_walk_all_lower_dev_rcu(struct net_device
*dev
,
7536 int (*fn
)(struct net_device
*dev
,
7537 struct netdev_nested_priv
*priv
),
7538 struct netdev_nested_priv
*priv
)
7540 struct net_device
*ldev
, *next
, *now
, *dev_stack
[MAX_NEST_DEV
+ 1];
7541 struct list_head
*niter
, *iter
, *iter_stack
[MAX_NEST_DEV
+ 1];
7545 iter
= &dev
->adj_list
.lower
;
7549 ret
= fn(now
, priv
);
7556 ldev
= netdev_next_lower_dev_rcu(now
, &iter
);
7561 niter
= &ldev
->adj_list
.lower
;
7562 dev_stack
[cur
] = now
;
7563 iter_stack
[cur
++] = iter
;
7570 next
= dev_stack
[--cur
];
7571 niter
= iter_stack
[cur
];
7580 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu
);
/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *				       lower neighbour list, RCU
 *				       variant
 * @dev: device
 *
 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 * list. The caller must hold RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
	struct netdev_adjacent *lower;

	lower = list_first_or_null_rcu(&dev->adj_list.lower,
			struct netdev_adjacent, list);
	if (lower)
		return lower->private;

	return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	upper = list_first_or_null_rcu(&dev->adj_list.upper,
				       struct netdev_adjacent, list);
	if (upper && likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7622 static int netdev_adjacent_sysfs_add(struct net_device
*dev
,
7623 struct net_device
*adj_dev
,
7624 struct list_head
*dev_list
)
7626 char linkname
[IFNAMSIZ
+7];
7628 sprintf(linkname
, dev_list
== &dev
->adj_list
.upper
?
7629 "upper_%s" : "lower_%s", adj_dev
->name
);
7630 return sysfs_create_link(&(dev
->dev
.kobj
), &(adj_dev
->dev
.kobj
),
7633 static void netdev_adjacent_sysfs_del(struct net_device
*dev
,
7635 struct list_head
*dev_list
)
7637 char linkname
[IFNAMSIZ
+7];
7639 sprintf(linkname
, dev_list
== &dev
->adj_list
.upper
?
7640 "upper_%s" : "lower_%s", name
);
7641 sysfs_remove_link(&(dev
->dev
.kobj
), linkname
);
7644 static inline bool netdev_adjacent_is_neigh_list(struct net_device
*dev
,
7645 struct net_device
*adj_dev
,
7646 struct list_head
*dev_list
)
7648 return (dev_list
== &dev
->adj_list
.upper
||
7649 dev_list
== &dev
->adj_list
.lower
) &&
7650 net_eq(dev_net(dev
), dev_net(adj_dev
));
7653 static int __netdev_adjacent_dev_insert(struct net_device
*dev
,
7654 struct net_device
*adj_dev
,
7655 struct list_head
*dev_list
,
7656 void *private, bool master
)
7658 struct netdev_adjacent
*adj
;
7661 adj
= __netdev_find_adj(adj_dev
, dev_list
);
7665 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7666 dev
->name
, adj_dev
->name
, adj
->ref_nr
);
7671 adj
= kmalloc(sizeof(*adj
), GFP_KERNEL
);
7676 adj
->master
= master
;
7678 adj
->private = private;
7679 adj
->ignore
= false;
7682 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7683 dev
->name
, adj_dev
->name
, adj
->ref_nr
, adj_dev
->name
);
7685 if (netdev_adjacent_is_neigh_list(dev
, adj_dev
, dev_list
)) {
7686 ret
= netdev_adjacent_sysfs_add(dev
, adj_dev
, dev_list
);
7691 /* Ensure that master link is always the first item in list. */
7693 ret
= sysfs_create_link(&(dev
->dev
.kobj
),
7694 &(adj_dev
->dev
.kobj
), "master");
7696 goto remove_symlinks
;
7698 list_add_rcu(&adj
->list
, dev_list
);
7700 list_add_tail_rcu(&adj
->list
, dev_list
);
7706 if (netdev_adjacent_is_neigh_list(dev
, adj_dev
, dev_list
))
7707 netdev_adjacent_sysfs_del(dev
, adj_dev
->name
, dev_list
);
7715 static void __netdev_adjacent_dev_remove(struct net_device
*dev
,
7716 struct net_device
*adj_dev
,
7718 struct list_head
*dev_list
)
7720 struct netdev_adjacent
*adj
;
7722 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7723 dev
->name
, adj_dev
->name
, ref_nr
);
7725 adj
= __netdev_find_adj(adj_dev
, dev_list
);
7728 pr_err("Adjacency does not exist for device %s from %s\n",
7729 dev
->name
, adj_dev
->name
);
7734 if (adj
->ref_nr
> ref_nr
) {
7735 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7736 dev
->name
, adj_dev
->name
, ref_nr
,
7737 adj
->ref_nr
- ref_nr
);
7738 adj
->ref_nr
-= ref_nr
;
7743 sysfs_remove_link(&(dev
->dev
.kobj
), "master");
7745 if (netdev_adjacent_is_neigh_list(dev
, adj_dev
, dev_list
))
7746 netdev_adjacent_sysfs_del(dev
, adj_dev
->name
, dev_list
);
7748 list_del_rcu(&adj
->list
);
7749 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7750 adj_dev
->name
, dev
->name
, adj_dev
->name
);
7752 kfree_rcu(adj
, rcu
);
7755 static int __netdev_adjacent_dev_link_lists(struct net_device
*dev
,
7756 struct net_device
*upper_dev
,
7757 struct list_head
*up_list
,
7758 struct list_head
*down_list
,
7759 void *private, bool master
)
7763 ret
= __netdev_adjacent_dev_insert(dev
, upper_dev
, up_list
,
7768 ret
= __netdev_adjacent_dev_insert(upper_dev
, dev
, down_list
,
7771 __netdev_adjacent_dev_remove(dev
, upper_dev
, 1, up_list
);
7778 static void __netdev_adjacent_dev_unlink_lists(struct net_device
*dev
,
7779 struct net_device
*upper_dev
,
7781 struct list_head
*up_list
,
7782 struct list_head
*down_list
)
7784 __netdev_adjacent_dev_remove(dev
, upper_dev
, ref_nr
, up_list
);
7785 __netdev_adjacent_dev_remove(upper_dev
, dev
, ref_nr
, down_list
);
7788 static int __netdev_adjacent_dev_link_neighbour(struct net_device
*dev
,
7789 struct net_device
*upper_dev
,
7790 void *private, bool master
)
7792 return __netdev_adjacent_dev_link_lists(dev
, upper_dev
,
7793 &dev
->adj_list
.upper
,
7794 &upper_dev
->adj_list
.lower
,
7798 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device
*dev
,
7799 struct net_device
*upper_dev
)
7801 __netdev_adjacent_dev_unlink_lists(dev
, upper_dev
, 1,
7802 &dev
->adj_list
.upper
,
7803 &upper_dev
->adj_list
.lower
);
7806 static int __netdev_upper_dev_link(struct net_device
*dev
,
7807 struct net_device
*upper_dev
, bool master
,
7808 void *upper_priv
, void *upper_info
,
7809 struct netdev_nested_priv
*priv
,
7810 struct netlink_ext_ack
*extack
)
7812 struct netdev_notifier_changeupper_info changeupper_info
= {
7817 .upper_dev
= upper_dev
,
7820 .upper_info
= upper_info
,
7822 struct net_device
*master_dev
;
7827 if (dev
== upper_dev
)
7830 /* To prevent loops, check if dev is not upper device to upper_dev. */
7831 if (__netdev_has_upper_dev(upper_dev
, dev
))
7834 if ((dev
->lower_level
+ upper_dev
->upper_level
) > MAX_NEST_DEV
)
7838 if (__netdev_has_upper_dev(dev
, upper_dev
))
7841 master_dev
= __netdev_master_upper_dev_get(dev
);
7843 return master_dev
== upper_dev
? -EEXIST
: -EBUSY
;
7846 ret
= call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER
,
7847 &changeupper_info
.info
);
7848 ret
= notifier_to_errno(ret
);
7852 ret
= __netdev_adjacent_dev_link_neighbour(dev
, upper_dev
, upper_priv
,
7857 ret
= call_netdevice_notifiers_info(NETDEV_CHANGEUPPER
,
7858 &changeupper_info
.info
);
7859 ret
= notifier_to_errno(ret
);
7863 __netdev_update_upper_level(dev
, NULL
);
7864 __netdev_walk_all_lower_dev(dev
, __netdev_update_upper_level
, NULL
);
7866 __netdev_update_lower_level(upper_dev
, priv
);
7867 __netdev_walk_all_upper_dev(upper_dev
, __netdev_update_lower_level
,
7873 __netdev_adjacent_dev_unlink_neighbour(dev
, upper_dev
);
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @extack: netlink extended ack
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev,
			  struct netlink_ext_ack *extack)
{
	struct netdev_nested_priv priv = {
		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
		.data = NULL,
	};

	return __netdev_upper_dev_link(dev, upper_dev, false,
				       NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 * @extack: netlink extended ack
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev,
				 void *upper_priv, void *upper_info,
				 struct netlink_ext_ack *extack)
{
	struct netdev_nested_priv priv = {
		.flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
		.data = NULL,
	};

	return __netdev_upper_dev_link(dev, upper_dev, true,
				       upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
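/* Illustrative usage sketch (not part of the original file): a master/port
 * driver would typically call this under RTNL when a port is added;
 * "bond_dev", "port_dev" and "slave_info" are hypothetical names.
 *
 *	err = netdev_master_upper_dev_link(port_dev, bond_dev,
 *					   NULL, &slave_info, extack);
 *	if (err)
 *		goto err_upper_link;
 */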
7932 static void __netdev_upper_dev_unlink(struct net_device
*dev
,
7933 struct net_device
*upper_dev
,
7934 struct netdev_nested_priv
*priv
)
7936 struct netdev_notifier_changeupper_info changeupper_info
= {
7940 .upper_dev
= upper_dev
,
7946 changeupper_info
.master
= netdev_master_upper_dev_get(dev
) == upper_dev
;
7948 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER
,
7949 &changeupper_info
.info
);
7951 __netdev_adjacent_dev_unlink_neighbour(dev
, upper_dev
);
7953 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER
,
7954 &changeupper_info
.info
);
7956 __netdev_update_upper_level(dev
, NULL
);
7957 __netdev_walk_all_lower_dev(dev
, __netdev_update_upper_level
, NULL
);
7959 __netdev_update_lower_level(upper_dev
, priv
);
7960 __netdev_walk_all_upper_dev(upper_dev
, __netdev_update_lower_level
,
/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_nested_priv priv = {
		.flags = NESTED_SYNC_TODO,
		.data = NULL,
	};

	__netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
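/* Illustrative usage sketch (not part of the original file): the unlink is
 * the teardown counterpart of netdev_master_upper_dev_link(), called from a
 * driver's release path while still holding RTNL; names are hypothetical.
 *
 *	ASSERT_RTNL();
 *	netdev_upper_dev_unlink(port_dev, bond_dev);
 */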
7984 static void __netdev_adjacent_dev_set(struct net_device
*upper_dev
,
7985 struct net_device
*lower_dev
,
7988 struct netdev_adjacent
*adj
;
7990 adj
= __netdev_find_adj(lower_dev
, &upper_dev
->adj_list
.lower
);
7994 adj
= __netdev_find_adj(upper_dev
, &lower_dev
->adj_list
.upper
);
7999 static void netdev_adjacent_dev_disable(struct net_device
*upper_dev
,
8000 struct net_device
*lower_dev
)
8002 __netdev_adjacent_dev_set(upper_dev
, lower_dev
, true);
8005 static void netdev_adjacent_dev_enable(struct net_device
*upper_dev
,
8006 struct net_device
*lower_dev
)
8008 __netdev_adjacent_dev_set(upper_dev
, lower_dev
, false);
8011 int netdev_adjacent_change_prepare(struct net_device
*old_dev
,
8012 struct net_device
*new_dev
,
8013 struct net_device
*dev
,
8014 struct netlink_ext_ack
*extack
)
8016 struct netdev_nested_priv priv
= {
8025 if (old_dev
&& new_dev
!= old_dev
)
8026 netdev_adjacent_dev_disable(dev
, old_dev
);
8027 err
= __netdev_upper_dev_link(new_dev
, dev
, false, NULL
, NULL
, &priv
,
8030 if (old_dev
&& new_dev
!= old_dev
)
8031 netdev_adjacent_dev_enable(dev
, old_dev
);
8037 EXPORT_SYMBOL(netdev_adjacent_change_prepare
);
8039 void netdev_adjacent_change_commit(struct net_device
*old_dev
,
8040 struct net_device
*new_dev
,
8041 struct net_device
*dev
)
8043 struct netdev_nested_priv priv
= {
8044 .flags
= NESTED_SYNC_IMM
| NESTED_SYNC_TODO
,
8048 if (!new_dev
|| !old_dev
)
8051 if (new_dev
== old_dev
)
8054 netdev_adjacent_dev_enable(dev
, old_dev
);
8055 __netdev_upper_dev_unlink(old_dev
, dev
, &priv
);
8057 EXPORT_SYMBOL(netdev_adjacent_change_commit
);
8059 void netdev_adjacent_change_abort(struct net_device
*old_dev
,
8060 struct net_device
*new_dev
,
8061 struct net_device
*dev
)
8063 struct netdev_nested_priv priv
= {
8071 if (old_dev
&& new_dev
!= old_dev
)
8072 netdev_adjacent_dev_enable(dev
, old_dev
);
8074 __netdev_upper_dev_unlink(new_dev
, dev
, &priv
);
8076 EXPORT_SYMBOL(netdev_adjacent_change_abort
);
/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
	struct netdev_notifier_bonding_info info = {
		.info.dev = dev,
	};

	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

/**
 * netdev_get_xmit_slave - Get the xmit slave of master device
 * @dev: device
 * @skb: The packet
 * @all_slaves: assume all the slaves are active
 *
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 * %NULL is returned if no slave is found.
 */
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
					 struct sk_buff *skb,
					 bool all_slaves)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_xmit_slave)
		return NULL;
	return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
}
EXPORT_SYMBOL(netdev_get_xmit_slave);
8123 static void netdev_adjacent_add_links(struct net_device
*dev
)
8125 struct netdev_adjacent
*iter
;
8127 struct net
*net
= dev_net(dev
);
8129 list_for_each_entry(iter
, &dev
->adj_list
.upper
, list
) {
8130 if (!net_eq(net
, dev_net(iter
->dev
)))
8132 netdev_adjacent_sysfs_add(iter
->dev
, dev
,
8133 &iter
->dev
->adj_list
.lower
);
8134 netdev_adjacent_sysfs_add(dev
, iter
->dev
,
8135 &dev
->adj_list
.upper
);
8138 list_for_each_entry(iter
, &dev
->adj_list
.lower
, list
) {
8139 if (!net_eq(net
, dev_net(iter
->dev
)))
8141 netdev_adjacent_sysfs_add(iter
->dev
, dev
,
8142 &iter
->dev
->adj_list
.upper
);
8143 netdev_adjacent_sysfs_add(dev
, iter
->dev
,
8144 &dev
->adj_list
.lower
);
8148 static void netdev_adjacent_del_links(struct net_device
*dev
)
8150 struct netdev_adjacent
*iter
;
8152 struct net
*net
= dev_net(dev
);
8154 list_for_each_entry(iter
, &dev
->adj_list
.upper
, list
) {
8155 if (!net_eq(net
, dev_net(iter
->dev
)))
8157 netdev_adjacent_sysfs_del(iter
->dev
, dev
->name
,
8158 &iter
->dev
->adj_list
.lower
);
8159 netdev_adjacent_sysfs_del(dev
, iter
->dev
->name
,
8160 &dev
->adj_list
.upper
);
8163 list_for_each_entry(iter
, &dev
->adj_list
.lower
, list
) {
8164 if (!net_eq(net
, dev_net(iter
->dev
)))
8166 netdev_adjacent_sysfs_del(iter
->dev
, dev
->name
,
8167 &iter
->dev
->adj_list
.upper
);
8168 netdev_adjacent_sysfs_del(dev
, iter
->dev
->name
,
8169 &dev
->adj_list
.lower
);
8173 void netdev_adjacent_rename_links(struct net_device
*dev
, char *oldname
)
8175 struct netdev_adjacent
*iter
;
8177 struct net
*net
= dev_net(dev
);
8179 list_for_each_entry(iter
, &dev
->adj_list
.upper
, list
) {
8180 if (!net_eq(net
, dev_net(iter
->dev
)))
8182 netdev_adjacent_sysfs_del(iter
->dev
, oldname
,
8183 &iter
->dev
->adj_list
.lower
);
8184 netdev_adjacent_sysfs_add(iter
->dev
, dev
,
8185 &iter
->dev
->adj_list
.lower
);
8188 list_for_each_entry(iter
, &dev
->adj_list
.lower
, list
) {
8189 if (!net_eq(net
, dev_net(iter
->dev
)))
8191 netdev_adjacent_sysfs_del(iter
->dev
, oldname
,
8192 &iter
->dev
->adj_list
.upper
);
8193 netdev_adjacent_sysfs_add(iter
->dev
, dev
,
8194 &iter
->dev
->adj_list
.upper
);
8198 void *netdev_lower_dev_get_private(struct net_device
*dev
,
8199 struct net_device
*lower_dev
)
8201 struct netdev_adjacent
*lower
;
8205 lower
= __netdev_find_adj(lower_dev
, &dev
->adj_list
.lower
);
8209 return lower
->private;
8211 EXPORT_SYMBOL(netdev_lower_dev_get_private
);
8215 * netdev_lower_state_changed - Dispatch event about lower device state change
8216 * @lower_dev: device
8217 * @lower_state_info: state to dispatch
8219 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8220 * The caller must hold the RTNL lock.
8222 void netdev_lower_state_changed(struct net_device
*lower_dev
,
8223 void *lower_state_info
)
8225 struct netdev_notifier_changelowerstate_info changelowerstate_info
= {
8226 .info
.dev
= lower_dev
,
8230 changelowerstate_info
.lower_state_info
= lower_state_info
;
8231 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE
,
8232 &changelowerstate_info
.info
);
8234 EXPORT_SYMBOL(netdev_lower_state_changed
);
8236 static void dev_change_rx_flags(struct net_device
*dev
, int flags
)
8238 const struct net_device_ops
*ops
= dev
->netdev_ops
;
8240 if (ops
->ndo_change_rx_flags
)
8241 ops
->ndo_change_rx_flags(dev
, flags
);
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(audit_context(), GFP_ATOMIC,
				  AUDIT_ANOM_PROMISCUOUS,
				  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				  dev->name, (dev->flags & IFF_PROMISC),
				  (old_flags & IFF_PROMISC),
				  from_kuid(&init_user_ns, audit_get_loginuid(current)),
				  from_kuid(&init_user_ns, uid),
				  from_kgid(&init_user_ns, gid),
				  audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc, true);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
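/* Illustrative usage sketch (not part of the original file): promiscuity is
 * reference counted; packet-capture style code takes one reference under
 * RTNL and drops it on teardown, and the device only leaves promiscuous
 * mode when the count returns to zero.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter
 *	...
 *	if (!err)
 *		dev_set_promiscuity(dev, -1);	// leave
 *	rtnl_unlock();
 */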
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}

/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all interfaces. Once it hits zero the device reverts back to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
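/* Illustrative usage sketch (not part of the original file): like
 * promiscuity, allmulti is a counter; a protocol that needs every multicast
 * frame takes a reference and drops it when it no longer does.
 *
 *	dev_set_allmulti(dev, 1);
 *	...
 *	dev_set_allmulti(dev, -1);
 */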
/*
 *	Upload unicast and multicast address lists to device and
 *	configure RX filtering. When the device doesn't support unicast
 *	filtering it is put in promiscuous mode while unicast addresses
 *	are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags&IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}

void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);
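/* Illustrative note (not part of the original file): the value returned by
 * dev_get_flags() is what SIOCGIFFLAGS and RTM_GETLINK report to userspace,
 * e.g.
 *
 *	unsigned int flags = dev_get_flags(dev);
 *
 *	if (flags & IFF_LOWER_UP)
 *		netdev_dbg(dev, "carrier is up\n");
 */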
8440 int __dev_change_flags(struct net_device
*dev
, unsigned int flags
,
8441 struct netlink_ext_ack
*extack
)
8443 unsigned int old_flags
= dev
->flags
;
8449 * Set the flags on our device.
8452 dev
->flags
= (flags
& (IFF_DEBUG
| IFF_NOTRAILERS
| IFF_NOARP
|
8453 IFF_DYNAMIC
| IFF_MULTICAST
| IFF_PORTSEL
|
8455 (dev
->flags
& (IFF_UP
| IFF_VOLATILE
| IFF_PROMISC
|
8459 * Load in the correct multicast list now the flags have changed.
8462 if ((old_flags
^ flags
) & IFF_MULTICAST
)
8463 dev_change_rx_flags(dev
, IFF_MULTICAST
);
8465 dev_set_rx_mode(dev
);
8468 * Have we downed the interface. We handle IFF_UP ourselves
8469 * according to user attempts to set it, rather than blindly
8474 if ((old_flags
^ flags
) & IFF_UP
) {
8475 if (old_flags
& IFF_UP
)
8478 ret
= __dev_open(dev
, extack
);
8481 if ((flags
^ dev
->gflags
) & IFF_PROMISC
) {
8482 int inc
= (flags
& IFF_PROMISC
) ? 1 : -1;
8483 unsigned int old_flags
= dev
->flags
;
8485 dev
->gflags
^= IFF_PROMISC
;
8487 if (__dev_set_promiscuity(dev
, inc
, false) >= 0)
8488 if (dev
->flags
!= old_flags
)
8489 dev_set_rx_mode(dev
);
8492 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8493 * is important. Some (broken) drivers set IFF_PROMISC, when
8494 * IFF_ALLMULTI is requested not asking us and not reporting.
8496 if ((flags
^ dev
->gflags
) & IFF_ALLMULTI
) {
8497 int inc
= (flags
& IFF_ALLMULTI
) ? 1 : -1;
8499 dev
->gflags
^= IFF_ALLMULTI
;
8500 __dev_set_allmulti(dev
, inc
, false);
8506 void __dev_notify_flags(struct net_device
*dev
, unsigned int old_flags
,
8507 unsigned int gchanges
)
8509 unsigned int changes
= dev
->flags
^ old_flags
;
8512 rtmsg_ifinfo(RTM_NEWLINK
, dev
, gchanges
, GFP_ATOMIC
);
8514 if (changes
& IFF_UP
) {
8515 if (dev
->flags
& IFF_UP
)
8516 call_netdevice_notifiers(NETDEV_UP
, dev
);
8518 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
8521 if (dev
->flags
& IFF_UP
&&
8522 (changes
& ~(IFF_UP
| IFF_PROMISC
| IFF_ALLMULTI
| IFF_VOLATILE
))) {
8523 struct netdev_notifier_change_info change_info
= {
8527 .flags_changed
= changes
,
8530 call_netdevice_notifiers_info(NETDEV_CHANGE
, &change_info
.info
);
8535 * dev_change_flags - change device settings
8537 * @flags: device state flags
8538 * @extack: netlink extended ack
8540 * Change settings on device based state flags. The flags are
8541 * in the userspace exported format.
8543 int dev_change_flags(struct net_device
*dev
, unsigned int flags
,
8544 struct netlink_ext_ack
*extack
)
8547 unsigned int changes
, old_flags
= dev
->flags
, old_gflags
= dev
->gflags
;
8549 ret
= __dev_change_flags(dev
, flags
, extack
);
8553 changes
= (old_flags
^ dev
->flags
) | (old_gflags
^ dev
->gflags
);
8554 __dev_notify_flags(dev
, old_flags
, changes
);
8557 EXPORT_SYMBOL(dev_change_flags
);
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

	/* Pairs with all the lockless reads of dev->mtu in the stack */
	WRITE_ONCE(dev->mtu, new_mtu);
	return 0;
}
EXPORT_SYMBOL(__dev_set_mtu);

int dev_validate_mtu(struct net_device *dev, int new_mtu,
		     struct netlink_ext_ack *extack)
{
	/* MTU must be positive, and in range */
	if (new_mtu < 0 || new_mtu < dev->min_mtu) {
		NL_SET_ERR_MSG(extack, "mtu less than device minimum");
		return -EINVAL;
	}

	if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
		NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
		return -EINVAL;
	}
	return 0;
}
/**
 * dev_set_mtu_ext - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 * @extack: netlink extended ack
 *
 * Change the maximum transfer size of the network device.
 */
int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
		    struct netlink_ext_ack *extack)
{
	int err, orig_mtu;

	if (new_mtu == dev->mtu)
		return 0;

	err = dev_validate_mtu(dev, new_mtu, extack);
	if (err)
		return err;

	if (!netif_device_present(dev))
		return -ENODEV;

	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);

	if (!err) {
		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						   orig_mtu);
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						     new_mtu);
		}
	}
	return err;
}

int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	struct netlink_ext_ack extack;
	int err;

	memset(&extack, 0, sizeof(extack));
	err = dev_set_mtu_ext(dev, new_mtu, &extack);
	if (err && extack._msg)
		net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);
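/* Illustrative usage sketch (not part of the original file): in-kernel
 * callers change the MTU under RTNL, for example a tunnel-like driver
 * shrinking its own MTU to leave room for encapsulation; "lower_dev" and
 * "overhead" are hypothetical names.
 *
 *	ASSERT_RTNL();
 *	err = dev_set_mtu(dev, lower_dev->mtu - overhead);
 */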
8649 * dev_change_tx_queue_len - Change TX queue length of a netdevice
8651 * @new_len: new tx queue length
8653 int dev_change_tx_queue_len(struct net_device
*dev
, unsigned long new_len
)
8655 unsigned int orig_len
= dev
->tx_queue_len
;
8658 if (new_len
!= (unsigned int)new_len
)
8661 if (new_len
!= orig_len
) {
8662 dev
->tx_queue_len
= new_len
;
8663 res
= call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN
, dev
);
8664 res
= notifier_to_errno(res
);
8667 res
= dev_qdisc_change_tx_queue_len(dev
);
8675 netdev_err(dev
, "refused to change device tx_queue_len\n");
8676 dev
->tx_queue_len
= orig_len
;
8681 * dev_set_group - Change group this device belongs to
8683 * @new_group: group this device should belong to
8685 void dev_set_group(struct net_device
*dev
, int new_group
)
8687 dev
->group
= new_group
;
8689 EXPORT_SYMBOL(dev_set_group
);
8692 * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8694 * @addr: new address
8695 * @extack: netlink extended ack
8697 int dev_pre_changeaddr_notify(struct net_device
*dev
, const char *addr
,
8698 struct netlink_ext_ack
*extack
)
8700 struct netdev_notifier_pre_changeaddr_info info
= {
8702 .info
.extack
= extack
,
8707 rc
= call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR
, &info
.info
);
8708 return notifier_to_errno(rc
);
8710 EXPORT_SYMBOL(dev_pre_changeaddr_notify
);
/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 * @extack: netlink extended ack
 *
 * Change the hardware (MAC) address of the device
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
			struct netlink_ext_ack *extack)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
	if (err)
		return err;
	err = ops->ndo_set_mac_address(dev, sa);
	if (err)
		return err;
	dev->addr_assign_type = NET_ADDR_SET;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);
	return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
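/* Illustrative usage sketch (not part of the original file): a caller fills
 * a struct sockaddr whose family matches dev->type before handing it over;
 * "new_addr" is a hypothetical byte array of dev->addr_len bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa, NULL);
 */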
/**
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);
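/* Illustrative note (not part of the original file): dev_change_carrier()
 * only forwards the request to the driver; software devices that want to
 * reflect such a change typically implement ndo_change_carrier() in terms
 * of netif_carrier_on()/netif_carrier_off().
 */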
8765 * dev_get_phys_port_id - Get device physical port ID
8769 * Get device physical port ID
8771 int dev_get_phys_port_id(struct net_device
*dev
,
8772 struct netdev_phys_item_id
*ppid
)
8774 const struct net_device_ops
*ops
= dev
->netdev_ops
;
8776 if (!ops
->ndo_get_phys_port_id
)
8778 return ops
->ndo_get_phys_port_id(dev
, ppid
);
8780 EXPORT_SYMBOL(dev_get_phys_port_id
);
8783 * dev_get_phys_port_name - Get device physical port name
8786 * @len: limit of bytes to copy to name
8788 * Get device physical port name
8790 int dev_get_phys_port_name(struct net_device
*dev
,
8791 char *name
, size_t len
)
8793 const struct net_device_ops
*ops
= dev
->netdev_ops
;
8796 if (ops
->ndo_get_phys_port_name
) {
8797 err
= ops
->ndo_get_phys_port_name(dev
, name
, len
);
8798 if (err
!= -EOPNOTSUPP
)
8801 return devlink_compat_phys_port_name_get(dev
, name
, len
);
8803 EXPORT_SYMBOL(dev_get_phys_port_name
);
8806 * dev_get_port_parent_id - Get the device's port parent identifier
8807 * @dev: network device
8808 * @ppid: pointer to a storage for the port's parent identifier
8809 * @recurse: allow/disallow recursion to lower devices
8811 * Get the devices's port parent identifier
8813 int dev_get_port_parent_id(struct net_device
*dev
,
8814 struct netdev_phys_item_id
*ppid
,
8817 const struct net_device_ops
*ops
= dev
->netdev_ops
;
8818 struct netdev_phys_item_id first
= { };
8819 struct net_device
*lower_dev
;
8820 struct list_head
*iter
;
8823 if (ops
->ndo_get_port_parent_id
) {
8824 err
= ops
->ndo_get_port_parent_id(dev
, ppid
);
8825 if (err
!= -EOPNOTSUPP
)
8829 err
= devlink_compat_switch_id_get(dev
, ppid
);
8830 if (!err
|| err
!= -EOPNOTSUPP
)
8836 netdev_for_each_lower_dev(dev
, lower_dev
, iter
) {
8837 err
= dev_get_port_parent_id(lower_dev
, ppid
, recurse
);
8842 else if (memcmp(&first
, ppid
, sizeof(*ppid
)))
8848 EXPORT_SYMBOL(dev_get_port_parent_id
);
8851 * netdev_port_same_parent_id - Indicate if two network devices have
8852 * the same port parent identifier
8853 * @a: first network device
8854 * @b: second network device
8856 bool netdev_port_same_parent_id(struct net_device
*a
, struct net_device
*b
)
8858 struct netdev_phys_item_id a_id
= { };
8859 struct netdev_phys_item_id b_id
= { };
8861 if (dev_get_port_parent_id(a
, &a_id
, true) ||
8862 dev_get_port_parent_id(b
, &b_id
, true))
8865 return netdev_phys_item_id_same(&a_id
, &b_id
);
8867 EXPORT_SYMBOL(netdev_port_same_parent_id
);
8870 * dev_change_proto_down - update protocol port state information
8872 * @proto_down: new value
8874 * This info can be used by switch drivers to set the phys state of the
8877 int dev_change_proto_down(struct net_device
*dev
, bool proto_down
)
8879 const struct net_device_ops
*ops
= dev
->netdev_ops
;
8881 if (!ops
->ndo_change_proto_down
)
8883 if (!netif_device_present(dev
))
8885 return ops
->ndo_change_proto_down(dev
, proto_down
);
8887 EXPORT_SYMBOL(dev_change_proto_down
);
8890 * dev_change_proto_down_generic - generic implementation for
8891 * ndo_change_proto_down that sets carrier according to
8895 * @proto_down: new value
8897 int dev_change_proto_down_generic(struct net_device
*dev
, bool proto_down
)
8900 netif_carrier_off(dev
);
8902 netif_carrier_on(dev
);
8903 dev
->proto_down
= proto_down
;
8906 EXPORT_SYMBOL(dev_change_proto_down_generic
);
8909 * dev_change_proto_down_reason - proto down reason
8912 * @mask: proto down mask
8913 * @value: proto down value
8915 void dev_change_proto_down_reason(struct net_device
*dev
, unsigned long mask
,
8921 dev
->proto_down_reason
= value
;
8923 for_each_set_bit(b
, &mask
, 32) {
8924 if (value
& (1 << b
))
8925 dev
->proto_down_reason
|= BIT(b
);
8927 dev
->proto_down_reason
&= ~BIT(b
);
8931 EXPORT_SYMBOL(dev_change_proto_down_reason
);
struct bpf_xdp_link {
	struct bpf_link link;
	struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
	int flags;
};

static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
{
	if (flags & XDP_FLAGS_HW_MODE)
		return XDP_MODE_HW;
	if (flags & XDP_FLAGS_DRV_MODE)
		return XDP_MODE_DRV;
	if (flags & XDP_FLAGS_SKB_MODE)
		return XDP_MODE_SKB;
	return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
}

static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
{
	switch (mode) {
	case XDP_MODE_SKB:
		return generic_xdp_install;
	case XDP_MODE_DRV:
	case XDP_MODE_HW:
		return dev->netdev_ops->ndo_bpf;
	default:
		return NULL;
	}
}

static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
					 enum bpf_xdp_mode mode)
{
	return dev->xdp_state[mode].link;
}

static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
				     enum bpf_xdp_mode mode)
{
	struct bpf_xdp_link *link = dev_xdp_link(dev, mode);

	if (link)
		return link->link.prog;
	return dev->xdp_state[mode].prog;
}

static u8 dev_xdp_prog_count(struct net_device *dev)
{
	u8 count = 0;
	int i;

	for (i = 0; i < __MAX_XDP_MODE; i++)
		if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
			count++;
	return count;
}

u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
	struct bpf_prog *prog = dev_xdp_prog(dev, mode);

	return prog ? prog->aux->id : 0;
}
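/* Illustrative note (not part of the original file): the XDP attach mode is
 * derived from the XDP_FLAGS_* bits, falling back to native mode only when
 * the driver implements ndo_bpf, e.g.
 *
 *	enum bpf_xdp_mode mode = dev_xdp_mode(dev, XDP_FLAGS_SKB_MODE);
 *	// mode == XDP_MODE_SKB, so generic_xdp_install will be used
 */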
8997 static void dev_xdp_set_link(struct net_device
*dev
, enum bpf_xdp_mode mode
,
8998 struct bpf_xdp_link
*link
)
9000 dev
->xdp_state
[mode
].link
= link
;
9001 dev
->xdp_state
[mode
].prog
= NULL
;
9004 static void dev_xdp_set_prog(struct net_device
*dev
, enum bpf_xdp_mode mode
,
9005 struct bpf_prog
*prog
)
9007 dev
->xdp_state
[mode
].link
= NULL
;
9008 dev
->xdp_state
[mode
].prog
= prog
;
9011 static int dev_xdp_install(struct net_device
*dev
, enum bpf_xdp_mode mode
,
9012 bpf_op_t bpf_op
, struct netlink_ext_ack
*extack
,
9013 u32 flags
, struct bpf_prog
*prog
)
9015 struct netdev_bpf xdp
;
9018 memset(&xdp
, 0, sizeof(xdp
));
9019 xdp
.command
= mode
== XDP_MODE_HW
? XDP_SETUP_PROG_HW
: XDP_SETUP_PROG
;
9020 xdp
.extack
= extack
;
9024 /* Drivers assume refcnt is already incremented (i.e, prog pointer is
9025 * "moved" into driver), so they don't increment it on their own, but
9026 * they do decrement refcnt when program is detached or replaced.
9027 * Given net_device also owns link/prog, we need to bump refcnt here
9028 * to prevent drivers from underflowing it.
9032 err
= bpf_op(dev
, &xdp
);
9039 if (mode
!= XDP_MODE_HW
)
9040 bpf_prog_change_xdp(dev_xdp_prog(dev
, mode
), prog
);
9045 static void dev_xdp_uninstall(struct net_device
*dev
)
9047 struct bpf_xdp_link
*link
;
9048 struct bpf_prog
*prog
;
9049 enum bpf_xdp_mode mode
;
9054 for (mode
= XDP_MODE_SKB
; mode
< __MAX_XDP_MODE
; mode
++) {
9055 prog
= dev_xdp_prog(dev
, mode
);
9059 bpf_op
= dev_xdp_bpf_op(dev
, mode
);
9063 WARN_ON(dev_xdp_install(dev
, mode
, bpf_op
, NULL
, 0, NULL
));
9065 /* auto-detach link from net device */
9066 link
= dev_xdp_link(dev
, mode
);
9072 dev_xdp_set_link(dev
, mode
, NULL
);
9076 static int dev_xdp_attach(struct net_device
*dev
, struct netlink_ext_ack
*extack
,
9077 struct bpf_xdp_link
*link
, struct bpf_prog
*new_prog
,
9078 struct bpf_prog
*old_prog
, u32 flags
)
9080 unsigned int num_modes
= hweight32(flags
& XDP_FLAGS_MODES
);
9081 struct bpf_prog
*cur_prog
;
9082 enum bpf_xdp_mode mode
;
9088 /* either link or prog attachment, never both */
9089 if (link
&& (new_prog
|| old_prog
))
9091 /* link supports only XDP mode flags */
9092 if (link
&& (flags
& ~XDP_FLAGS_MODES
)) {
9093 NL_SET_ERR_MSG(extack
, "Invalid XDP flags for BPF link attachment");
9096 /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9097 if (num_modes
> 1) {
9098 NL_SET_ERR_MSG(extack
, "Only one XDP mode flag can be set");
9101 /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9102 if (!num_modes
&& dev_xdp_prog_count(dev
) > 1) {
9103 NL_SET_ERR_MSG(extack
,
9104 "More than one program loaded, unset mode is ambiguous");
9107 /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9108 if (old_prog
&& !(flags
& XDP_FLAGS_REPLACE
)) {
9109 NL_SET_ERR_MSG(extack
, "XDP_FLAGS_REPLACE is not specified");
9113 mode
= dev_xdp_mode(dev
, flags
);
9114 /* can't replace attached link */
9115 if (dev_xdp_link(dev
, mode
)) {
9116 NL_SET_ERR_MSG(extack
, "Can't replace active BPF XDP link");
9120 cur_prog
= dev_xdp_prog(dev
, mode
);
9121 /* can't replace attached prog with link */
9122 if (link
&& cur_prog
) {
9123 NL_SET_ERR_MSG(extack
, "Can't replace active XDP program with BPF link");
9126 if ((flags
& XDP_FLAGS_REPLACE
) && cur_prog
!= old_prog
) {
9127 NL_SET_ERR_MSG(extack
, "Active program does not match expected");
9131 /* put effective new program into new_prog */
9133 new_prog
= link
->link
.prog
;
9136 bool offload
= mode
== XDP_MODE_HW
;
9137 enum bpf_xdp_mode other_mode
= mode
== XDP_MODE_SKB
9138 ? XDP_MODE_DRV
: XDP_MODE_SKB
;
9140 if ((flags
& XDP_FLAGS_UPDATE_IF_NOEXIST
) && cur_prog
) {
9141 NL_SET_ERR_MSG(extack
, "XDP program already attached");
9144 if (!offload
&& dev_xdp_prog(dev
, other_mode
)) {
9145 NL_SET_ERR_MSG(extack
, "Native and generic XDP can't be active at the same time");
9148 if (!offload
&& bpf_prog_is_dev_bound(new_prog
->aux
)) {
9149 NL_SET_ERR_MSG(extack
, "Using device-bound program without HW_MODE flag is not supported");
9152 if (new_prog
->expected_attach_type
== BPF_XDP_DEVMAP
) {
9153 NL_SET_ERR_MSG(extack
, "BPF_XDP_DEVMAP programs can not be attached to a device");
9156 if (new_prog
->expected_attach_type
== BPF_XDP_CPUMAP
) {
9157 NL_SET_ERR_MSG(extack
, "BPF_XDP_CPUMAP programs can not be attached to a device");
9162 /* don't call drivers if the effective program didn't change */
9163 if (new_prog
!= cur_prog
) {
9164 bpf_op
= dev_xdp_bpf_op(dev
, mode
);
9166 NL_SET_ERR_MSG(extack
, "Underlying driver does not support XDP in native mode");
9170 err
= dev_xdp_install(dev
, mode
, bpf_op
, extack
, flags
, new_prog
);
9176 dev_xdp_set_link(dev
, mode
, link
);
9178 dev_xdp_set_prog(dev
, mode
, new_prog
);
9180 bpf_prog_put(cur_prog
);
9185 static int dev_xdp_attach_link(struct net_device
*dev
,
9186 struct netlink_ext_ack
*extack
,
9187 struct bpf_xdp_link
*link
)
9189 return dev_xdp_attach(dev
, extack
, link
, NULL
, NULL
, link
->flags
);
9192 static int dev_xdp_detach_link(struct net_device
*dev
,
9193 struct netlink_ext_ack
*extack
,
9194 struct bpf_xdp_link
*link
)
9196 enum bpf_xdp_mode mode
;
9201 mode
= dev_xdp_mode(dev
, link
->flags
);
9202 if (dev_xdp_link(dev
, mode
) != link
)
9205 bpf_op
= dev_xdp_bpf_op(dev
, mode
);
9206 WARN_ON(dev_xdp_install(dev
, mode
, bpf_op
, NULL
, 0, NULL
));
9207 dev_xdp_set_link(dev
, mode
, NULL
);
9211 static void bpf_xdp_link_release(struct bpf_link
*link
)
9213 struct bpf_xdp_link
*xdp_link
= container_of(link
, struct bpf_xdp_link
, link
);
9217 /* if racing with net_device's tear down, xdp_link->dev might be
9218 * already NULL, in which case link was already auto-detached
9220 if (xdp_link
->dev
) {
9221 WARN_ON(dev_xdp_detach_link(xdp_link
->dev
, NULL
, xdp_link
));
9222 xdp_link
->dev
= NULL
;
9228 static int bpf_xdp_link_detach(struct bpf_link
*link
)
9230 bpf_xdp_link_release(link
);
9234 static void bpf_xdp_link_dealloc(struct bpf_link
*link
)
9236 struct bpf_xdp_link
*xdp_link
= container_of(link
, struct bpf_xdp_link
, link
);
9241 static void bpf_xdp_link_show_fdinfo(const struct bpf_link
*link
,
9242 struct seq_file
*seq
)
9244 struct bpf_xdp_link
*xdp_link
= container_of(link
, struct bpf_xdp_link
, link
);
9249 ifindex
= xdp_link
->dev
->ifindex
;
9252 seq_printf(seq
, "ifindex:\t%u\n", ifindex
);
9255 static int bpf_xdp_link_fill_link_info(const struct bpf_link
*link
,
9256 struct bpf_link_info
*info
)
9258 struct bpf_xdp_link
*xdp_link
= container_of(link
, struct bpf_xdp_link
, link
);
9263 ifindex
= xdp_link
->dev
->ifindex
;
9266 info
->xdp
.ifindex
= ifindex
;
9270 static int bpf_xdp_link_update(struct bpf_link
*link
, struct bpf_prog
*new_prog
,
9271 struct bpf_prog
*old_prog
)
9273 struct bpf_xdp_link
*xdp_link
= container_of(link
, struct bpf_xdp_link
, link
);
9274 enum bpf_xdp_mode mode
;
9280 /* link might have been auto-released already, so fail */
9281 if (!xdp_link
->dev
) {
9286 if (old_prog
&& link
->prog
!= old_prog
) {
9290 old_prog
= link
->prog
;
9291 if (old_prog
== new_prog
) {
9292 /* no-op, don't disturb drivers */
9293 bpf_prog_put(new_prog
);
9297 mode
= dev_xdp_mode(xdp_link
->dev
, xdp_link
->flags
);
9298 bpf_op
= dev_xdp_bpf_op(xdp_link
->dev
, mode
);
9299 err
= dev_xdp_install(xdp_link
->dev
, mode
, bpf_op
, NULL
,
9300 xdp_link
->flags
, new_prog
);
9304 old_prog
= xchg(&link
->prog
, new_prog
);
9305 bpf_prog_put(old_prog
);
9312 static const struct bpf_link_ops bpf_xdp_link_lops
= {
9313 .release
= bpf_xdp_link_release
,
9314 .dealloc
= bpf_xdp_link_dealloc
,
9315 .detach
= bpf_xdp_link_detach
,
9316 .show_fdinfo
= bpf_xdp_link_show_fdinfo
,
9317 .fill_link_info
= bpf_xdp_link_fill_link_info
,
9318 .update_prog
= bpf_xdp_link_update
,
9321 int bpf_xdp_link_attach(const union bpf_attr
*attr
, struct bpf_prog
*prog
)
9323 struct net
*net
= current
->nsproxy
->net_ns
;
9324 struct bpf_link_primer link_primer
;
9325 struct bpf_xdp_link
*link
;
9326 struct net_device
*dev
;
9329 dev
= dev_get_by_index(net
, attr
->link_create
.target_ifindex
);
9333 link
= kzalloc(sizeof(*link
), GFP_USER
);
9339 bpf_link_init(&link
->link
, BPF_LINK_TYPE_XDP
, &bpf_xdp_link_lops
, prog
);
9341 link
->flags
= attr
->link_create
.flags
;
9343 err
= bpf_link_prime(&link
->link
, &link_primer
);
9350 err
= dev_xdp_attach_link(dev
, NULL
, link
);
9354 bpf_link_cleanup(&link_primer
);
9358 fd
= bpf_link_settle(&link_primer
);
9359 /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9369 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
9371 * @extack: netlink extended ack
9372 * @fd: new program fd or negative value to clear
9373 * @expected_fd: old program fd that userspace expects to replace or clear
9374 * @flags: xdp-related flags
9376 * Set or clear a bpf program for a device
9378 int dev_change_xdp_fd(struct net_device
*dev
, struct netlink_ext_ack
*extack
,
9379 int fd
, int expected_fd
, u32 flags
)
9381 enum bpf_xdp_mode mode
= dev_xdp_mode(dev
, flags
);
9382 struct bpf_prog
*new_prog
= NULL
, *old_prog
= NULL
;
9388 new_prog
= bpf_prog_get_type_dev(fd
, BPF_PROG_TYPE_XDP
,
9389 mode
!= XDP_MODE_SKB
);
9390 if (IS_ERR(new_prog
))
9391 return PTR_ERR(new_prog
);
9394 if (expected_fd
>= 0) {
9395 old_prog
= bpf_prog_get_type_dev(expected_fd
, BPF_PROG_TYPE_XDP
,
9396 mode
!= XDP_MODE_SKB
);
9397 if (IS_ERR(old_prog
)) {
9398 err
= PTR_ERR(old_prog
);
9404 err
= dev_xdp_attach(dev
, extack
, NULL
, new_prog
, old_prog
, flags
);
9407 if (err
&& new_prog
)
9408 bpf_prog_put(new_prog
);
9410 bpf_prog_put(old_prog
);
/**
 * dev_new_index - allocate an ifindex
 * @net: the applicable net namespace
 *
 * Returns a suitable unique value for a new device interface
 * number.  The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	int ifindex = net->ifindex;

	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return net->ifindex = ifindex;
	}
}

/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}
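/* Illustrative note (not part of the original file): devices queued here via
 * net_set_todo() are drained later by netdev_run_todo(), which waits for the
 * remaining references on each device before it can be freed; this is part
 * of why unregister_netdevice() completion is deferred.
 */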
9444 static void rollback_registered_many(struct list_head
*head
)
9446 struct net_device
*dev
, *tmp
;
9447 LIST_HEAD(close_head
);
9449 BUG_ON(dev_boot_phase
);
9452 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
) {
9453 /* Some devices call without registering
9454 * for initialization unwind. Remove those
9455 * devices and proceed with the remaining.
9457 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
9458 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
9462 list_del(&dev
->unreg_list
);
9465 dev
->dismantle
= true;
9466 BUG_ON(dev
->reg_state
!= NETREG_REGISTERED
);
9469 /* If device is running, close it first. */
9470 list_for_each_entry(dev
, head
, unreg_list
)
9471 list_add_tail(&dev
->close_list
, &close_head
);
9472 dev_close_many(&close_head
, true);
9474 list_for_each_entry(dev
, head
, unreg_list
) {
9475 /* And unlink it from device chain. */
9476 unlist_netdevice(dev
);
9478 dev
->reg_state
= NETREG_UNREGISTERING
;
9480 flush_all_backlogs();
9484 list_for_each_entry(dev
, head
, unreg_list
) {
9485 struct sk_buff
*skb
= NULL
;
9487 /* Shutdown queueing discipline. */
9490 dev_xdp_uninstall(dev
);
9492 /* Notify protocols, that we are about to destroy
9493 * this device. They should clean all the things.
9495 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
9497 if (!dev
->rtnl_link_ops
||
9498 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
9499 skb
= rtmsg_ifinfo_build_skb(RTM_DELLINK
, dev
, ~0U, 0,
9500 GFP_KERNEL
, NULL
, 0);
9503 * Flush the unicast and multicast chains
9508 netdev_name_node_alt_flush(dev
);
9509 netdev_name_node_free(dev
->name_node
);
9511 if (dev
->netdev_ops
->ndo_uninit
)
9512 dev
->netdev_ops
->ndo_uninit(dev
);
9515 rtmsg_ifinfo_send(skb
, dev
, GFP_KERNEL
);
9517 /* Notifier chain MUST detach us all upper devices. */
9518 WARN_ON(netdev_has_any_upper_dev(dev
));
9519 WARN_ON(netdev_has_any_lower_dev(dev
));
9521 /* Remove entries from kobject tree */
9522 netdev_unregister_kobject(dev
);
9524 /* Remove XPS queueing entries */
9525 netif_reset_xps_queues_gt(dev
, 0);
9531 list_for_each_entry(dev
, head
, unreg_list
)
9535 static void rollback_registered(struct net_device
*dev
)
9539 list_add(&dev
->unreg_list
, &single
);
9540 rollback_registered_many(&single
);
9544 static netdev_features_t
netdev_sync_upper_features(struct net_device
*lower
,
9545 struct net_device
*upper
, netdev_features_t features
)
9547 netdev_features_t upper_disables
= NETIF_F_UPPER_DISABLES
;
9548 netdev_features_t feature
;
9551 for_each_netdev_feature(upper_disables
, feature_bit
) {
9552 feature
= __NETIF_F_BIT(feature_bit
);
9553 if (!(upper
->wanted_features
& feature
)
9554 && (features
& feature
)) {
9555 netdev_dbg(lower
, "Dropping feature %pNF, upper dev %s has it off.\n",
9556 &feature
, upper
->name
);
9557 features
&= ~feature
;
static void netdev_sync_lower_features(struct net_device *upper,
	struct net_device *lower, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(features & feature) && (lower->features & feature)) {
			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
				   &feature, lower->name);
			lower->wanted_features &= ~feature;
			__netdev_update_features(lower);

			if (unlikely(lower->features & feature))
				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
					    &feature, lower->name);
			else
				netdev_features_change(lower);
		}
	}
}
static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
	    !(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
	    !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
		features &= ~NETIF_F_TSO_MANGLEID;

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* GSO partial features require GSO partial be set */
	if ((features & dev->gso_partial_features) &&
	    !(features & NETIF_F_GSO_PARTIAL)) {
		netdev_dbg(dev,
			   "Dropping partially supported GSO features since no GSO partial.\n");
		features &= ~dev->gso_partial_features;
	}

	if (!(features & NETIF_F_RXCSUM)) {
		/* NETIF_F_GRO_HW implies doing RXCSUM since every packet
		 * successfully merged by hardware must also have the
		 * checksum verified by hardware. If the user does not
		 * want to enable RXCSUM, logically, we should disable GRO_HW.
		 */
		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	/* LRO/HW-GRO features cannot be combined with RX-FCS */
	if (features & NETIF_F_RXFCS) {
		if (features & NETIF_F_LRO) {
			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_LRO;
		}

		if (features & NETIF_F_GRO_HW) {
			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
			features &= ~NETIF_F_GRO_HW;
		}
	}

	if ((features & NETIF_F_HW_TLS_TX) && !(features & NETIF_F_HW_CSUM)) {
		netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
		features &= ~NETIF_F_HW_TLS_TX;
	}

	return features;
}
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		   &dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			   "set_features() failed (%d); wanted %pNF, left %pNF\n",
			   err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err) {
		netdev_features_t diff = features ^ dev->features;

		if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
			/* udp_tunnel_{get,drop}_rx_info both need
			 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
			 * device, or they won't do anything.
			 * Thus we need to update dev->features
			 * *before* calling udp_tunnel_get_rx_info,
			 * but *after* calling udp_tunnel_drop_rx_info.
			 */
			if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
				dev->features = features;
				udp_tunnel_get_rx_info(dev);
			} else {
				udp_tunnel_drop_rx_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_ctag_filter_info(dev);
			} else {
				vlan_drop_rx_ctag_filter_info(dev);
			}
		}

		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
				dev->features = features;
				err |= vlan_get_rx_stag_filter_info(dev);
			} else {
				vlan_drop_rx_stag_filter_info(dev);
			}
		}

		dev->features = features;
	}

	return err < 0 ? 0 : 1;
}
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
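
/* Editor's note: an illustrative sketch (not part of this file) of how a
 * driver is expected to use netdev_update_features(). The foo_* names and
 * the priv->hw_lro field are hypothetical; only the core calls shown
 * (rtnl_lock/rtnl_unlock and netdev_update_features) are real kernel APIs.
 *
 *	static void foo_set_lro(struct foo_priv *priv, bool enable)
 *	{
 *		rtnl_lock();
 *		priv->hw_lro = enable;			// hypothetical device state
 *		if (enable)
 *			priv->netdev->hw_features |= NETIF_F_LRO;
 *		else
 *			priv->netdev->hw_features &= ~NETIF_F_LRO;
 *		// recompute dev->features and notify listeners if it changed
 *		netdev_update_features(priv->netdev);
 *		rtnl_unlock();
 *	}
 */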
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
/**
 *	netif_stacked_transfer_operstate -	transfer operstate
 *	@rootdev: the root or lower level device to transfer state from
 *	@dev: the device to transfer operstate to
 *
 *	Transfer operational state from root to device. This is normally
 *	called when a stacking relationship exists between the root
 *	device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
				      struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

	if (rootdev->operstate == IF_OPER_TESTING)
		netif_testing_on(dev);
	else
		netif_testing_off(dev);

	if (netif_carrier_ok(rootdev))
		netif_carrier_on(dev);
	else
		netif_carrier_off(dev);
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);
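
/* Editor's note: illustrative sketch only. A stacking driver (for example a
 * VLAN-like upper device) would typically call this from its link-state
 * handling so the upper device mirrors the carrier/dormant/testing state of
 * the real device underneath. foo_upper/foo_lower are hypothetical names.
 *
 *	static void foo_sync_link(struct net_device *foo_upper,
 *				  const struct net_device *foo_lower)
 *	{
 *		// copy operstate + carrier from the lower (root) device
 *		netif_stacked_transfer_operstate(foo_lower, foo_upper);
 *	}
 */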
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx;
	size_t sz = count * sizeof(*rx);
	int err = 0;

	BUG_ON(count < 1);

	rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!rx)
		return -ENOMEM;

	dev->_rx = rx;

	for (i = 0; i < count; i++) {
		rx[i].dev = dev;

		/* XDP RX-queue setup */
		err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
		if (err < 0)
			goto err_rxq_info;
	}
	return 0;

err_rxq_info:
	/* Rollback successful reg's and free other resources */
	while (i--)
		xdp_rxq_info_unreg(&rx[i].xdp_rxq);
	kvfree(dev->_rx);
	dev->_rx = NULL;
	return err;
}
static void netif_free_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;

	/* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
	if (!dev->_rx)
		return;

	for (i = 0; i < count; i++)
		xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);

	kvfree(dev->_rx);
}
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}

static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}
static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
	size_t sz = count * sizeof(*tx);

	if (count < 1 || count > 0xffff)
		return -EINVAL;

	tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!tx)
		return -ENOMEM;

	dev->_tx = tx;

	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);

	return 0;
}
void netif_tx_stop_all_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		netif_tx_stop_queue(txq);
	}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
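
/* Editor's note: illustrative sketch only. A multiqueue driver usually stops
 * all TX queues before tearing down its rings in ndo_stop(); the foo_*
 * helpers are hypothetical placeholders for driver-specific work.
 *
 *	static int foo_stop(struct net_device *dev)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		netif_tx_stop_all_queues(dev);	// no more xmit after this
 *		foo_disable_irqs(priv);		// hypothetical
 *		foo_free_rings(priv);		// hypothetical
 *		return 0;
 *	}
 */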
/**
 *	register_netdevice	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	register_netdev() instead of this.
 *
 *	BUGS:
 *	The locking appears insufficient to guarantee two parallel registers
 *	will not get the same name.
 */
int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
		     NETDEV_FEATURE_COUNT);
	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	ret = ethtool_check_ops(dev->ethtool_ops);
	if (ret)
		return ret;

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	ret = -ENOMEM;
	dev->name_node = netdev_name_node_head_alloc(dev);
	if (!dev->name_node)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto err_free_name;
		}
	}

	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
	dev->features |= NETIF_F_SOFT_FEATURES;

	if (dev->netdev_ops->ndo_udp_tunnel_add) {
		dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
		dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
	}

	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK))
		dev->hw_features |= NETIF_F_NOCACHE_COPY;

	/* If IPv4 TCP segmentation offload is supported we should also
	 * allow the device to enable segmenting the frame with the option
	 * of ignoring a static IP ID value. This doesn't enable the
	 * feature itself but allows the user to enable it later.
	 */
	if (dev->hw_features & NETIF_F_TSO)
		dev->hw_features |= NETIF_F_TSO_MANGLEID;
	if (dev->vlan_features & NETIF_F_TSO)
		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
	if (dev->mpls_features & NETIF_F_TSO)
		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
	if (dev->hw_enc_features & NETIF_F_TSO)
		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret) {
		dev->reg_state = NETREG_UNREGISTERED;
		goto err_uninit;
	}
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 *	Default initial state at registry is that the
	 *	device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		rcu_barrier();

		dev->reg_state = NETREG_UNREGISTERED;
		/* We should put the kobject that is held by
		 * netdev_unregister_kobject(), otherwise the net device
		 * cannot be freed when the driver calls free_netdev(),
		 * because the kobject is still being held.
		 */
		kobject_put(&dev->dev.kobj);
	}
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	if (dev->priv_destructor)
		dev->priv_destructor(dev);
err_free_name:
	netdev_name_node_free(dev->name_node);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
/**
 *	init_dummy_netdev	- init a dummy network device for NAPI
 *	@dev: device to init
 *
 *	This takes a network device structure and initializes the minimum
 *	amount of fields so it can be used to schedule NAPI polls without
 *	registering a full blown interface. This is to be used by drivers
 *	that need to tie several hardware interfaces to a single NAPI
 *	poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* napi_busy_loop stats accounting wants this */
	dev_net_set(dev, &init_net);

	/* Note : We don't allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' don't need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);
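
/* Editor's note: illustrative sketch only. Drivers that must funnel several
 * hardware queues or interfaces into one NAPI context typically keep a dummy
 * netdev purely as a NAPI anchor. foo_priv and foo_poll are hypothetical;
 * the init_dummy_netdev/netif_napi_add/napi_enable calls are real.
 *
 *	struct foo_priv {
 *		struct net_device napi_dev;	// never registered
 *		struct napi_struct napi;
 *	};
 *
 *	static void foo_setup_napi(struct foo_priv *priv)
 *	{
 *		init_dummy_netdev(&priv->napi_dev);
 *		netif_napi_add(&priv->napi_dev, &priv->napi, foo_poll,
 *			       NAPI_POLL_WEIGHT);
 *		napi_enable(&priv->napi);
 *	}
 */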
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore and expands the device name if you passed a format string
 *	to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	if (rtnl_lock_killable())
		return -EINTR;
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
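
/* Editor's note: illustrative sketch only. A typical probe path allocates
 * the netdev, fills in its ops, and registers it last; foo_netdev_ops, the
 * foo_* names and struct foo_device are hypothetical, while alloc_etherdev,
 * register_netdev and free_netdev are the real APIs.
 *
 *	static int foo_probe(struct foo_device *fdev)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct foo_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &foo_netdev_ops;	// hypothetical ops
 *		err = register_netdev(dev);		// takes rtnl itself
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 */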
int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int wait = 0, refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		if (!wait) {
			rcu_barrier();
			wait = WAIT_REFS_MIN_MSECS;
		} else {
			msleep(wait);
			wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
		}

		refcnt = netdev_refcnt_read(dev);

		if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;
#ifdef CONFIG_LOCKDEP
	struct list_head unlink_list;

	list_replace_init(&net_unlink_list, &unlink_list);

	while (!list_empty(&unlink_list)) {
		struct net_device *dev = list_first_entry(&unlink_list,
							  struct net_device,
							  unlink_list);
		list_del_init(&dev->unlink_list);
		dev->nested_level = dev->lower_level - 1;
	}
#endif

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
#if IS_ENABLED(CONFIG_DECNET)
		WARN_ON(dev->dn_ptr);
#endif
		if (dev->priv_destructor)
			dev->priv_destructor(dev);
		if (dev->needs_free_netdev)
			free_netdev(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 * all the same fields in the same order as net_device_stats, with only
 * the type differing, but rtnl_link_stats64 may have additional fields
 * at the end for newer counters.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
	/* zero out counters that only exist in rtnl_link_stats64 */
	memset((char *)stats64 + sizeof(*netdev_stats), 0,
	       sizeof(*stats64) - sizeof(*netdev_stats));
#else
	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
	for (i = 0; i < n; i++)
		dst[i] = src[i];
	/* zero out counters that only exist in rtnl_link_stats64 */
	memset((char *)stats64 + n * sizeof(u64), 0,
	       sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
 *	@storage: place to store stats
 *
 *	Get network statistics from device. Return @storage.
 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *	otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
	storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);
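
/* Editor's note: illustrative sketch only. A driver that keeps its own
 * 64-bit counters provides ndo_get_stats64 and dev_get_stats() then adds
 * the core-maintained drop counters on top; foo_priv and its counter
 * fields are hypothetical.
 *
 *	static void foo_get_stats64(struct net_device *dev,
 *				    struct rtnl_link_stats64 *stats)
 *	{
 *		struct foo_priv *priv = netdev_priv(dev);
 *
 *		stats->rx_packets = priv->rx_packets;	// hypothetical fields
 *		stats->tx_packets = priv->tx_packets;
 *		stats->rx_bytes   = priv->rx_bytes;
 *		stats->tx_bytes   = priv->tx_bytes;
 *	}
 */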
/**
 *	dev_fetch_sw_netstats - get per-cpu network device statistics
 *	@s: place to store stats
 *	@netstats: per-cpu network stats to read from
 *
 *	Read per-cpu network statistics and populate the related fields in @s.
 */
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
			   const struct pcpu_sw_netstats __percpu *netstats)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		const struct pcpu_sw_netstats *stats;
		struct pcpu_sw_netstats tmp;
		unsigned int start;

		stats = per_cpu_ptr(netstats, cpu);
		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			tmp.rx_packets = stats->rx_packets;
			tmp.rx_bytes   = stats->rx_bytes;
			tmp.tx_packets = stats->tx_packets;
			tmp.tx_bytes   = stats->tx_bytes;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

		s->rx_packets += tmp.rx_packets;
		s->rx_bytes   += tmp.rx_bytes;
		s->tx_packets += tmp.tx_packets;
		s->tx_bytes   += tmp.tx_bytes;
	}
}
EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
/**
 *	dev_get_tstats64 - ndo_get_stats64 implementation
 *	@dev: device to get statistics from
 *	@s: place to store stats
 *
 *	Populate @s from dev->stats and dev->tstats. Can be used as
 *	ndo_get_stats64() callback.
 */
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
	netdev_stats_to_stats64(s, &dev->stats);
	dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);
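
/* Editor's note: illustrative sketch only. A tunnel-style driver that
 * allocates dev->tstats can point its ndo_get_stats64 directly at
 * dev_get_tstats64(); foo_netdev_ops and foo_xmit are hypothetical names.
 *
 *	static const struct net_device_ops foo_netdev_ops = {
 *		.ndo_start_xmit	 = foo_xmit,		// hypothetical
 *		.ndo_get_stats64 = dev_get_tstats64,	// real helper above
 *	};
 */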
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}
static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}
/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization. Also allocates subqueue structs
 * for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	unsigned int alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->upper_level = 1;
	dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
	dev->nested_level = 0;
	INIT_LIST_HEAD(&dev->unlink_list);
#endif

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
	hash_init(dev->qdisc_hash);
#endif
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
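
/* Editor's note: illustrative sketch only. Most callers go through the
 * alloc_netdev()/alloc_etherdev_mq() wrappers rather than calling
 * alloc_netdev_mqs() directly; foo_priv, foo_setup and the queue counts
 * below are hypothetical.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */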
/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released. If this
 * is the last reference then it will be freed. Must be called in process
 * context.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	might_sleep();
	netif_free_tx_queues(dev);
	netif_free_rx_queues(dev);

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;
	free_percpu(dev->xdp_bulkq);
	dev->xdp_bulkq = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 * synchronize_net -  Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore. You may want
 *	unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *	Note: As most callers use a stack allocated list_head,
 *	we force a list_del() to make sure the stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore. In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
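
/* Editor's note: illustrative sketch only. The usual driver remove path is
 * the mirror image of probe: unregister first (this quiesces the device),
 * release driver resources, then free the netdev. struct foo_device and the
 * foo_* helpers are hypothetical.
 *
 *	static void foo_remove(struct foo_device *fdev)
 *	{
 *		struct net_device *dev = fdev->netdev;	// hypothetical field
 *
 *		unregister_netdev(dev);		// takes and drops rtnl
 *		foo_release_hw(fdev);		// hypothetical
 *		free_netdev(dev);
 *	}
 */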
/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	struct net *net_old = dev_net(dev);
	int err, new_nsid, new_ifindex;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(net_old, net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	 * this device. They should clean all the things.
	 *
	 * Note that dev->reg_state stays at NETREG_REGISTERED.
	 * This is wanted because this way 8021q and macvlan know
	 * the device is just moving and can keep their slaves up.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();

	new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		new_ifindex = dev_new_index(net);
	else
		new_ifindex = dev->ifindex;

	rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
			    new_ifindex);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Move per-net netdevice notifiers that are following the netdevice */
	move_netdevice_notifiers_dev_net(dev, net);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);
	dev->ifindex = new_ifindex;

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Adapt owner in case owning user namespace of target network
	 * namespace is different from the original one.
	 */
	err = netdev_change_owner(dev, net_old, net);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu;
	struct softnet_data *sd, *oldsd, *remsd = NULL;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception :
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

#ifdef CONFIG_RPS
	remsd = oldsd->rps_ipi_list;
	oldsd->rps_ipi_list = NULL;
#endif
	/* send out pending IPI's on offline CPU */
	net_rps_send_ipi(remsd);

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return 0;
}
/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all. Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_HW_CSUM)
		mask |= NETIF_F_CSUM_MASK;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
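
/* Editor's note: illustrative sketch only. An aggregating driver (bonding-
 * or bridge-like) typically recomputes its feature set by folding in each
 * slave with netdev_increment_features(); struct foo_priv, its slaves list
 * and the starting feature set below are hypothetical.
 *
 *	static netdev_features_t foo_fix_features(struct net_device *master,
 *						  netdev_features_t mask)
 *	{
 *		struct foo_priv *priv = netdev_priv(master);
 *		netdev_features_t features = NETIF_F_ALL_FOR_ALL; // start wide
 *		struct foo_slave *s;
 *
 *		list_for_each_entry(s, &priv->slaves, list)	// hypothetical list
 *			features = netdev_increment_features(features,
 *							     s->dev->features,
 *							     mask);
 *		return features;
 *	}
 */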
static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	BUILD_BUG_ON(GRO_HASH_BUCKETS >
		     8 * sizeof_field(struct napi_struct, gro_bitmask));

	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}
static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	__netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
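
/* Editor's note: illustrative sketch only. Drivers use these wrappers
 * instead of raw printk() so every message is prefixed with the driver and
 * interface name; the condition and variables below are hypothetical.
 *
 *	if (ring_full)					// hypothetical condition
 *		netdev_warn(dev, "TX ring %u full, stopping queue\n", qidx);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);	// hypothetical vars
 */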
static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
	if (net != &init_net)
		WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		if (__dev_get_by_name(&init_net, fb_name))
			snprintf(fb_name, IFNAMSIZ, "dev%%d");
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network devices
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}
static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
		skb_queue_head_init(&sd->xfrm_backlog);
#endif
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
		sd->cpu = i;
#endif

		init_gro_hash(&sd->backlog);
		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices. Ensure the loopback device
	 * is the first device that appears and the last network device
	 * that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
				       NULL, dev_cpu_dead);
	WARN_ON(rc < 0);
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);