net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <linux/bpf.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <net/busy_poll.h>
 101 #include <linux/rtnetlink.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/dst_metadata.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/iw_handler.h>
 115 #include <asm/current.h>
 116 #include <linux/audit.h>
 117 #include <linux/dmaengine.h>
 118 #include <linux/err.h>
 119 #include <linux/ctype.h>
 120 #include <linux/if_arp.h>
 121 #include <linux/if_vlan.h>
 122 #include <linux/ip.h>
 123 #include <net/ip.h>
 124 #include <net/mpls.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/static_key.h>
 136 #include <linux/hashtable.h>
 137 #include <linux/vmalloc.h>
 138 #include <linux/if_macvlan.h>
 139 #include <linux/errqueue.h>
 140 #include <linux/hrtimer.h>
 141 #include <linux/netfilter_ingress.h>
 142 #include <linux/sctp.h>
 143 #include <linux/crash_dump.h>
 144
 145 #include "net-sysfs.h"
 146
 147 /* Instead of increasing this, you should create a hash table. */
 148 #define MAX_GRO_SKBS 8
 149
 150 /* This should be increased if a protocol with a bigger head is added. */
 151 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 152
 153 static DEFINE_SPINLOCK(ptype_lock);
 154 static DEFINE_SPINLOCK(offload_lock);
 155 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 156 struct list_head ptype_all __read_mostly;       /* Taps */
 157 static struct list_head offload_base __read_mostly;
 158
 159 static int netif_rx_internal(struct sk_buff *skb);
 160 static int call_netdevice_notifiers_info(unsigned long val,
 161                                          struct net_device *dev,
 162                                          struct netdev_notifier_info *info);
 163
 164 /*
 165  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 166  * semaphore.
 167  *
 168  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 169  *
 170  * Writers must hold the rtnl semaphore while they loop through the
 171  * dev_base_head list, and hold dev_base_lock for writing when they do the
 172  * actual updates.  This allows pure readers to access the list even
 173  * while a writer is preparing to update it.
 174  *
 175  * To put it another way, dev_base_lock is held for writing only to
 176  * protect against pure readers; the rtnl semaphore provides the
 177  * protection against other writers.
 178  *
 179  * See, for example usages, register_netdevice() and
 180  * unregister_netdevice(), which must be called with the rtnl
 181  * semaphore held.
 182  */
 183 DEFINE_RWLOCK(dev_base_lock);
 184 EXPORT_SYMBOL(dev_base_lock);
 185
 186 /* protects napi_hash addition/deletion and napi_gen_id */
 187 static DEFINE_SPINLOCK(napi_hash_lock);
 188
 189 static unsigned int napi_gen_id = NR_CPUS;
 190 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 191
 192 static seqcount_t devnet_rename_seq;
 193
 194 static inline void dev_base_seq_inc(struct net *net)
 195 {
 196         while (++net->dev_base_seq == 0);
 197 }
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 202
 203         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 204 }
 205
 206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 207 {
 208         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 209 }
 210
 211 static inline void rps_lock(struct softnet_data *sd)
 212 {
 213 #ifdef CONFIG_RPS
 214         spin_lock(&sd->input_pkt_queue.lock);
 215 #endif
 216 }
 217
 218 static inline void rps_unlock(struct softnet_data *sd)
 219 {
 220 #ifdef CONFIG_RPS
 221         spin_unlock(&sd->input_pkt_queue.lock);
 222 #endif
 223 }
 224
 225 /* Device list insertion */
 226 static void list_netdevice(struct net_device *dev)
 227 {
 228         struct net *net = dev_net(dev);
 229
 230         ASSERT_RTNL();
 231
 232         write_lock_bh(&dev_base_lock);
 233         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 234         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 235         hlist_add_head_rcu(&dev->index_hlist,
 236                            dev_index_hash(net, dev->ifindex));
 237         write_unlock_bh(&dev_base_lock);
 238
 239         dev_base_seq_inc(net);
 240 }
 241
 242 /* Device list removal
 243  * caller must respect a RCU grace period before freeing/reusing dev
 244  */
 245 static void unlist_netdevice(struct net_device *dev)
 246 {
 247         ASSERT_RTNL();
 248
 249         /* Unlink dev from the device chain */
 250         write_lock_bh(&dev_base_lock);
 251         list_del_rcu(&dev->dev_list);
 252         hlist_del_rcu(&dev->name_hlist);
 253         hlist_del_rcu(&dev->index_hlist);
 254         write_unlock_bh(&dev_base_lock);
 255
 256         dev_base_seq_inc(dev_net(dev));
 257 }
 258
 259 /*
 260  *      Our notifier list
 261  */
 262
 263 static RAW_NOTIFIER_HEAD(netdev_chain);
 264
 265 /*
 266  *      Device drivers call our routines to queue packets here. We empty the
 267  *      queue in the local softnet handler.
 268  */
 269
 270 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 271 EXPORT_PER_CPU_SYMBOL(softnet_data);
 272
 273 #ifdef CONFIG_LOCKDEP
 274 /*
 275  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 276  * according to dev->type
 277  */
 278 static const unsigned short netdev_lock_type[] =
 279         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 280          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 281          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 282          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 283          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 284          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 285          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 286          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 287          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 288          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 289          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 290          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 291          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 292          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 293          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 294
 295 static const char *const netdev_lock_name[] =
 296         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 309          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 310          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 311
 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314
 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 316 {
 317         int i;
 318
 319         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 320                 if (netdev_lock_type[i] == dev_type)
 321                         return i;
 322         /* the last key is used by default */
 323         return ARRAY_SIZE(netdev_lock_type) - 1;
 324 }
 325
 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 327                                                  unsigned short dev_type)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev_type);
 332         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 333                                    netdev_lock_name[i]);
 334 }
 335
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338         int i;
 339
 340         i = netdev_lock_pos(dev->type);
 341         lockdep_set_class_and_name(&dev->addr_list_lock,
 342                                    &netdev_addr_lock_key[i],
 343                                    netdev_lock_name[i]);
 344 }
 345 #else
 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 347                                                  unsigned short dev_type)
 348 {
 349 }
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352 }
 353 #endif
 354
 355 /*******************************************************************************
 356
 357                 Protocol management and registration routines
 358
 359 *******************************************************************************/
 360
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the check of protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles the packet were
 *	first on the list, it would not be able to sense that the packet
 *	is cloned and should be copied-on-write, so it would change the
 *	packet and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 376
 377 static inline struct list_head *ptype_head(const struct packet_type *pt)
 378 {
 379         if (pt->type == htons(ETH_P_ALL))
 380                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 381         else
 382                 return pt->dev ? &pt->dev->ptype_specific :
 383                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384 }
 385
 386 /**
 387  *      dev_add_pack - add packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Add a protocol handler to the networking stack. The passed &packet_type
 391  *      is linked into kernel lists and may not be freed until it has been
 392  *      removed from the kernel lists.
 393  *
 394  *      This call does not sleep therefore it can not
 395  *      guarantee all CPU's that are in middle of receiving packets
 396  *      will see the new packet type (until the next received packet).
 397  */
 398
 399 void dev_add_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head = ptype_head(pt);
 402
 403         spin_lock(&ptype_lock);
 404         list_add_rcu(&pt->list, head);
 405         spin_unlock(&ptype_lock);
 406 }
 407 EXPORT_SYMBOL(dev_add_pack);
 408
 409 /**
 410  *      __dev_remove_pack        - remove packet handler
 411  *      @pt: packet type declaration
 412  *
 413  *      Remove a protocol handler that was previously added to the kernel
 414  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415  *      from the kernel lists and can be freed or reused once this function
 416  *      returns.
 417  *
 418  *      The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPU's have gone
 420  *      through a quiescent state.
 421  */
 422 void __dev_remove_pack(struct packet_type *pt)
 423 {
 424         struct list_head *head = ptype_head(pt);
 425         struct packet_type *pt1;
 426
 427         spin_lock(&ptype_lock);
 428
 429         list_for_each_entry(pt1, head, list) {
 430                 if (pt == pt1) {
 431                         list_del_rcu(&pt->list);
 432                         goto out;
 433                 }
 434         }
 435
 436         pr_warn("dev_remove_pack: %p not found\n", pt);
 437 out:
 438         spin_unlock(&ptype_lock);
 439 }
 440 EXPORT_SYMBOL(__dev_remove_pack);
 441
 442 /**
 443  *      dev_remove_pack  - remove packet handler
 444  *      @pt: packet type declaration
 445  *
 446  *      Remove a protocol handler that was previously added to the kernel
 447  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448  *      from the kernel lists and can be freed or reused once this function
 449  *      returns.
 450  *
 451  *      This call sleeps to guarantee that no CPU is looking at the packet
 452  *      type after return.
 453  */
 454 void dev_remove_pack(struct packet_type *pt)
 455 {
 456         __dev_remove_pack(pt);
 457
 458         synchronize_net();
 459 }
 460 EXPORT_SYMBOL(dev_remove_pack);
 461
 462
 463 /**
 464  *      dev_add_offload - register offload handlers
 465  *      @po: protocol offload declaration
 466  *
 467  *      Add protocol offload handlers to the networking stack. The passed
 468  *      &proto_offload is linked into kernel lists and may not be freed until
 469  *      it has been removed from the kernel lists.
 470  *
 471  *      This call does not sleep therefore it can not
 472  *      guarantee all CPU's that are in middle of receiving packets
 473  *      will see the new offload handlers (until the next received packet).
 474  */
 475 void dev_add_offload(struct packet_offload *po)
 476 {
 477         struct packet_offload *elem;
 478
 479         spin_lock(&offload_lock);
 480         list_for_each_entry(elem, &offload_base, list) {
 481                 if (po->priority < elem->priority)
 482                         break;
 483         }
 484         list_add_rcu(&po->list, elem->list.prev);
 485         spin_unlock(&offload_lock);
 486 }
 487 EXPORT_SYMBOL(dev_add_offload);
 488
 489 /**
 490  *      __dev_remove_offload     - remove offload handler
 491  *      @po: packet offload declaration
 492  *
 493  *      Remove a protocol offload handler that was previously added to the
 494  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 495  *      is removed from the kernel lists and can be freed or reused once this
 496  *      function returns.
 497  *
 498  *      The packet type might still be in use by receivers
 499  *      and must not be freed until after all the CPU's have gone
 500  *      through a quiescent state.
 501  */
 502 static void __dev_remove_offload(struct packet_offload *po)
 503 {
 504         struct list_head *head = &offload_base;
 505         struct packet_offload *po1;
 506
 507         spin_lock(&offload_lock);
 508
 509         list_for_each_entry(po1, head, list) {
 510                 if (po == po1) {
 511                         list_del_rcu(&po->list);
 512                         goto out;
 513                 }
 514         }
 515
 516         pr_warn("dev_remove_offload: %p not found\n", po);
 517 out:
 518         spin_unlock(&offload_lock);
 519 }
 520
 521 /**
 522  *      dev_remove_offload       - remove packet offload handler
 523  *      @po: packet offload declaration
 524  *
 525  *      Remove a packet offload handler that was previously added to the kernel
 526  *      offload handlers by dev_add_offload(). The passed &offload_type is
 527  *      removed from the kernel lists and can be freed or reused once this
 528  *      function returns.
 529  *
 530  *      This call sleeps to guarantee that no CPU is looking at the packet
 531  *      type after return.
 532  */
 533 void dev_remove_offload(struct packet_offload *po)
 534 {
 535         __dev_remove_offload(po);
 536
 537         synchronize_net();
 538 }
 539 EXPORT_SYMBOL(dev_remove_offload);
 540
 541 /******************************************************************************
 542
 543                       Device Boot-time Settings Routines
 544
 545 *******************************************************************************/
 546
 547 /* Boot time configuration table */
 548 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 549
 550 /**
 551  *      netdev_boot_setup_add   - add new setup entry
 552  *      @name: name of the device
 553  *      @map: configured settings for the device
 554  *
 555  *      Adds new setup entry to the dev_boot_setup list.  The function
 556  *      returns 0 on error and 1 on success.  This is a generic routine to
 557  *      all netdevices.
 558  */
 559 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 560 {
 561         struct netdev_boot_setup *s;
 562         int i;
 563
 564         s = dev_boot_setup;
 565         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 566                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 567                         memset(s[i].name, 0, sizeof(s[i].name));
 568                         strlcpy(s[i].name, name, IFNAMSIZ);
 569                         memcpy(&s[i].map, map, sizeof(s[i].map));
 570                         break;
 571                 }
 572         }
 573
 574         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 575 }
 576
 577 /**
 578  *      netdev_boot_setup_check - check boot time settings
 579  *      @dev: the netdevice
 580  *
 581  *      Check boot time settings for the device.
 582  *      The found settings are set for the device to be used
 583  *      later in the device probing.
 584  *      Returns 0 if no settings found, 1 if they are.
 585  */
 586 int netdev_boot_setup_check(struct net_device *dev)
 587 {
 588         struct netdev_boot_setup *s = dev_boot_setup;
 589         int i;
 590
 591         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 592                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 593                     !strcmp(dev->name, s[i].name)) {
 594                         dev->irq        = s[i].map.irq;
 595                         dev->base_addr  = s[i].map.base_addr;
 596                         dev->mem_start  = s[i].map.mem_start;
 597                         dev->mem_end    = s[i].map.mem_end;
 598                         return 1;
 599                 }
 600         }
 601         return 0;
 602 }
 603 EXPORT_SYMBOL(netdev_boot_setup_check);
 604
 605
 606 /**
 607  *      netdev_boot_base        - get address from boot time settings
 608  *      @prefix: prefix for network device
 609  *      @unit: id for network device
 610  *
 611  *      Check boot time settings for the base address of device.
 612  *      The found settings are set for the device to be used
 613  *      later in the device probing.
 614  *      Returns 0 if no settings found.
 615  */
 616 unsigned long netdev_boot_base(const char *prefix, int unit)
 617 {
 618         const struct netdev_boot_setup *s = dev_boot_setup;
 619         char name[IFNAMSIZ];
 620         int i;
 621
 622         sprintf(name, "%s%d", prefix, unit);
 623
 624         /*
 625          * If device already registered then return base of 1
 626          * to indicate not to probe for this interface
 627          */
 628         if (__dev_get_by_name(&init_net, name))
 629                 return 1;
 630
 631         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 632                 if (!strcmp(name, s[i].name))
 633                         return s[i].map.base_addr;
 634         return 0;
 635 }
 636
 637 /*
 638  * Saves at boot time configured settings for any netdevice.
 639  */
 640 int __init netdev_boot_setup(char *str)
 641 {
 642         int ints[5];
 643         struct ifmap map;
 644
 645         str = get_options(str, ARRAY_SIZE(ints), ints);
 646         if (!str || !*str)
 647                 return 0;
 648
 649         /* Save settings */
 650         memset(&map, 0, sizeof(map));
 651         if (ints[0] > 0)
 652                 map.irq = ints[1];
 653         if (ints[0] > 1)
 654                 map.base_addr = ints[2];
 655         if (ints[0] > 2)
 656                 map.mem_start = ints[3];
 657         if (ints[0] > 3)
 658                 map.mem_end = ints[4];
 659
 660         /* Add new entry to the list */
 661         return netdev_boot_setup_add(str, &map);
 662 }
 663
 664 __setup("netdev=", netdev_boot_setup);
 665
 666 /*******************************************************************************
 667
 668                             Device Interface Subroutines
 669
 670 *******************************************************************************/
 671
 672 /**
 673  *      dev_get_iflink  - get 'iflink' value of a interface
 674  *      @dev: targeted interface
 675  *
 676  *      Indicates the ifindex the interface is linked to.
 677  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 678  */
 679
 680 int dev_get_iflink(const struct net_device *dev)
 681 {
 682         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 683                 return dev->netdev_ops->ndo_get_iflink(dev);
 684
 685         return dev->ifindex;
 686 }
 687 EXPORT_SYMBOL(dev_get_iflink);
 688
 689 /**
 690  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 691  *      @dev: targeted interface
 692  *      @skb: The packet.
 693  *
 694  *      For better visibility of tunnel traffic OVS needs to retrieve
 695  *      egress tunnel information for a packet. Following API allows
 696  *      user to get this info.
 697  */
 698 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 699 {
 700         struct ip_tunnel_info *info;
 701
 702         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 703                 return -EINVAL;
 704
 705         info = skb_tunnel_info_unclone(skb);
 706         if (!info)
 707                 return -ENOMEM;
 708         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 709                 return -EINVAL;
 710
 711         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 712 }
 713 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 714
 715 /**
 716  *      __dev_get_by_name       - find a device by its name
 717  *      @net: the applicable net namespace
 718  *      @name: name to find
 719  *
 720  *      Find an interface by name. Must be called under RTNL semaphore
 721  *      or @dev_base_lock. If the name is found a pointer to the device
 722  *      is returned. If the name is not found then %NULL is returned. The
 723  *      reference counters are not incremented so the caller must be
 724  *      careful with locks.
 725  */
 726
 727 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 728 {
 729         struct net_device *dev;
 730         struct hlist_head *head = dev_name_hash(net, name);
 731
 732         hlist_for_each_entry(dev, head, name_hlist)
 733                 if (!strncmp(dev->name, name, IFNAMSIZ))
 734                         return dev;
 735
 736         return NULL;
 737 }
 738 EXPORT_SYMBOL(__dev_get_by_name);
 739
 740 /**
 741  *      dev_get_by_name_rcu     - find a device by its name
 742  *      @net: the applicable net namespace
 743  *      @name: name to find
 744  *
 745  *      Find an interface by name.
 746  *      If the name is found a pointer to the device is returned.
 747  *      If the name is not found then %NULL is returned.
 748  *      The reference counters are not incremented so the caller must be
 749  *      careful with locks. The caller must hold RCU lock.
 750  */
 751
 752 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 753 {
 754         struct net_device *dev;
 755         struct hlist_head *head = dev_name_hash(net, name);
 756
 757         hlist_for_each_entry_rcu(dev, head, name_hlist)
 758                 if (!strncmp(dev->name, name, IFNAMSIZ))
 759                         return dev;
 760
 761         return NULL;
 762 }
 763 EXPORT_SYMBOL(dev_get_by_name_rcu);
 764
 765 /**
 766  *      dev_get_by_name         - find a device by its name
 767  *      @net: the applicable net namespace
 768  *      @name: name to find
 769  *
 770  *      Find an interface by name. This can be called from any
 771  *      context and does its own locking. The returned handle has
 772  *      the usage count incremented and the caller must use dev_put() to
 773  *      release it when it is no longer needed. %NULL is returned if no
 774  *      matching device is found.
 775  */
 776
 777 struct net_device *dev_get_by_name(struct net *net, const char *name)
 778 {
 779         struct net_device *dev;
 780
 781         rcu_read_lock();
 782         dev = dev_get_by_name_rcu(net, name);
 783         if (dev)
 784                 dev_hold(dev);
 785         rcu_read_unlock();
 786         return dev;
 787 }
 788 EXPORT_SYMBOL(dev_get_by_name);
 789
 790 /**
 791  *      __dev_get_by_index - find a device by its ifindex
 792  *      @net: the applicable net namespace
 793  *      @ifindex: index of device
 794  *
 795  *      Search for an interface by index. Returns %NULL if the device
 796  *      is not found or a pointer to the device. The device has not
 797  *      had its reference counter increased so the caller must be careful
 798  *      about locking. The caller must hold either the RTNL semaphore
 799  *      or @dev_base_lock.
 800  */
 801
 802 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 803 {
 804         struct net_device *dev;
 805         struct hlist_head *head = dev_index_hash(net, ifindex);
 806
 807         hlist_for_each_entry(dev, head, index_hlist)
 808                 if (dev->ifindex == ifindex)
 809                         return dev;
 810
 811         return NULL;
 812 }
 813 EXPORT_SYMBOL(__dev_get_by_index);
 814
 815 /**
 816  *      dev_get_by_index_rcu - find a device by its ifindex
 817  *      @net: the applicable net namespace
 818  *      @ifindex: index of device
 819  *
 820  *      Search for an interface by index. Returns %NULL if the device
 821  *      is not found or a pointer to the device. The device has not
 822  *      had its reference counter increased so the caller must be careful
 823  *      about locking. The caller must hold RCU lock.
 824  */
 825
 826 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 827 {
 828         struct net_device *dev;
 829         struct hlist_head *head = dev_index_hash(net, ifindex);
 830
 831         hlist_for_each_entry_rcu(dev, head, index_hlist)
 832                 if (dev->ifindex == ifindex)
 833                         return dev;
 834
 835         return NULL;
 836 }
 837 EXPORT_SYMBOL(dev_get_by_index_rcu);
 838
 839
 840 /**
 841  *      dev_get_by_index - find a device by its ifindex
 842  *      @net: the applicable net namespace
 843  *      @ifindex: index of device
 844  *
 845  *      Search for an interface by index. Returns NULL if the device
 846  *      is not found or a pointer to the device. The device returned has
 847  *      had a reference added and the pointer is safe until the user calls
 848  *      dev_put to indicate they have finished with it.
 849  */
 850
 851 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 852 {
 853         struct net_device *dev;
 854
 855         rcu_read_lock();
 856         dev = dev_get_by_index_rcu(net, ifindex);
 857         if (dev)
 858                 dev_hold(dev);
 859         rcu_read_unlock();
 860         return dev;
 861 }
 862 EXPORT_SYMBOL(dev_get_by_index);
 863
 864 /**
 865  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 866  *      @net: network namespace
 867  *      @name: a pointer to the buffer where the name will be stored.
 868  *      @ifindex: the ifindex of the interface to get the name from.
 869  *
 870  *      The use of raw_seqcount_begin() and cond_resched() before
 871  *      retrying is required as we want to give the writers a chance
 872  *      to complete when CONFIG_PREEMPT is not set.
 873  */
 874 int netdev_get_name(struct net *net, char *name, int ifindex)
 875 {
 876         struct net_device *dev;
 877         unsigned int seq;
 878
 879 retry:
 880         seq = raw_seqcount_begin(&devnet_rename_seq);
 881         rcu_read_lock();
 882         dev = dev_get_by_index_rcu(net, ifindex);
 883         if (!dev) {
 884                 rcu_read_unlock();
 885                 return -ENODEV;
 886         }
 887
 888         strcpy(name, dev->name);
 889         rcu_read_unlock();
 890         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 891                 cond_resched();
 892                 goto retry;
 893         }
 894
 895         return 0;
 896 }
 897
 898 /**
 899  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 900  *      @net: the applicable net namespace
 901  *      @type: media type of device
 902  *      @ha: hardware address
 903  *
 904  *      Search for an interface by MAC address. Returns NULL if the device
 905  *      is not found or a pointer to the device.
 906  *      The caller must hold RCU or RTNL.
 907  *      The returned device has not had its ref count increased
 908  *      and the caller must therefore be careful about locking
 909  *
 910  */
 911
 912 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 913                                        const char *ha)
 914 {
 915         struct net_device *dev;
 916
 917         for_each_netdev_rcu(net, dev)
 918                 if (dev->type == type &&
 919                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 920                         return dev;
 921
 922         return NULL;
 923 }
 924 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 925
 926 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 927 {
 928         struct net_device *dev;
 929
 930         ASSERT_RTNL();
 931         for_each_netdev(net, dev)
 932                 if (dev->type == type)
 933                         return dev;
 934
 935         return NULL;
 936 }
 937 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 938
 939 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 940 {
 941         struct net_device *dev, *ret = NULL;
 942
 943         rcu_read_lock();
 944         for_each_netdev_rcu(net, dev)
 945                 if (dev->type == type) {
 946                         dev_hold(dev);
 947                         ret = dev;
 948                         break;
 949                 }
 950         rcu_read_unlock();
 951         return ret;
 952 }
 953 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 954
 955 /**
 956  *      __dev_get_by_flags - find any device with given flags
 957  *      @net: the applicable net namespace
 958  *      @if_flags: IFF_* values
 959  *      @mask: bitmask of bits in if_flags to check
 960  *
 961  *      Search for any interface with the given flags. Returns NULL if a device
 962  *      is not found or a pointer to the device. Must be called inside
 963  *      rtnl_lock(), and result refcount is unchanged.
 964  */
 965
 966 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 967                                       unsigned short mask)
 968 {
 969         struct net_device *dev, *ret;
 970
 971         ASSERT_RTNL();
 972
 973         ret = NULL;
 974         for_each_netdev(net, dev) {
 975                 if (((dev->flags ^ if_flags) & mask) == 0) {
 976                         ret = dev;
 977                         break;
 978                 }
 979         }
 980         return ret;
 981 }
 982 EXPORT_SYMBOL(__dev_get_by_flags);
 983
 984 /**
 985  *      dev_valid_name - check if name is okay for network device
 986  *      @name: name string
 987  *
 988  *      Network device names need to be valid file names to
 989  *      to allow sysfs to work.  We also disallow any kind of
 990  *      whitespace.
 991  */
 992 bool dev_valid_name(const char *name)
 993 {
 994         if (*name == '\0')
 995                 return false;
 996         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 997                 return false;
 998         if (!strcmp(name, ".") || !strcmp(name, ".."))
 999                 return false;
1000
1001         while (*name) {
1002                 if (*name == '/' || *name == ':' || isspace(*name))
1003                         return false;
1004                 name++;
1005         }
1006         return true;
1007 }
1008 EXPORT_SYMBOL(dev_valid_name);
1009
1010 /**
1011  *      __dev_alloc_name - allocate a name for a device
1012  *      @net: network namespace to allocate the device name in
1013  *      @name: name format string
1014  *      @buf:  scratch buffer and result name string
1015  *
1016  *      Passed a format string - eg "lt%d" it will try and find a suitable
1017  *      id. It scans list of devices to build up a free map, then chooses
1018  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1019  *      while allocating the name and adding the device in order to avoid
1020  *      duplicates.
1021  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022  *      Returns the number of the unit assigned or a negative errno code.
1023  */
1024
1025 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1026 {
1027         int i = 0;
1028         const char *p;
1029         const int max_netdevices = 8*PAGE_SIZE;
1030         unsigned long *inuse;
1031         struct net_device *d;
1032
1033         p = strnchr(name, IFNAMSIZ-1, '%');
1034         if (p) {
1035                 /*
1036                  * Verify the string as this thing may have come from
1037                  * the user.  There must be either one "%d" and no other "%"
1038                  * characters.
1039                  */
1040                 if (p[1] != 'd' || strchr(p + 2, '%'))
1041                         return -EINVAL;
1042
1043                 /* Use one page as a bit array of possible slots */
1044                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1045                 if (!inuse)
1046                         return -ENOMEM;
1047
1048                 for_each_netdev(net, d) {
1049                         if (!sscanf(d->name, name, &i))
1050                                 continue;
1051                         if (i < 0 || i >= max_netdevices)
1052                                 continue;
1053
1054                         /*  avoid cases where sscanf is not exact inverse of printf */
1055                         snprintf(buf, IFNAMSIZ, name, i);
1056                         if (!strncmp(buf, d->name, IFNAMSIZ))
1057                                 set_bit(i, inuse);
1058                 }
1059
1060                 i = find_first_zero_bit(inuse, max_netdevices);
1061                 free_page((unsigned long) inuse);
1062         }
1063
1064         if (buf != name)
1065                 snprintf(buf, IFNAMSIZ, name, i);
1066         if (!__dev_get_by_name(net, buf))
1067                 return i;
1068
1069         /* It is possible to run out of possible slots
1070          * when the name is long and there isn't enough space left
1071          * for the digits, or if all bits are used.
1072          */
1073         return -ENFILE;
1074 }
1075
1076 /**
1077  *      dev_alloc_name - allocate a name for a device
1078  *      @dev: device
1079  *      @name: name format string
1080  *
1081  *      Passed a format string - eg "lt%d" it will try and find a suitable
1082  *      id. It scans list of devices to build up a free map, then chooses
1083  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1084  *      while allocating the name and adding the device in order to avoid
1085  *      duplicates.
1086  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1087  *      Returns the number of the unit assigned or a negative errno code.
1088  */
1089
1090 int dev_alloc_name(struct net_device *dev, const char *name)
1091 {
1092         char buf[IFNAMSIZ];
1093         struct net *net;
1094         int ret;
1095
1096         BUG_ON(!dev_net(dev));
1097         net = dev_net(dev);
1098         ret = __dev_alloc_name(net, name, buf);
1099         if (ret >= 0)
1100                 strlcpy(dev->name, buf, IFNAMSIZ);
1101         return ret;
1102 }
1103 EXPORT_SYMBOL(dev_alloc_name);
1104
1105 static int dev_alloc_name_ns(struct net *net,
1106                              struct net_device *dev,
1107                              const char *name)
1108 {
1109         char buf[IFNAMSIZ];
1110         int ret;
1111
1112         ret = __dev_alloc_name(net, name, buf);
1113         if (ret >= 0)
1114                 strlcpy(dev->name, buf, IFNAMSIZ);
1115         return ret;
1116 }
1117
1118 int dev_get_valid_name(struct net *net, struct net_device *dev,
1119                        const char *name)
1120 {
1121         BUG_ON(!net);
1122
1123         if (!dev_valid_name(name))
1124                 return -EINVAL;
1125
1126         if (strchr(name, '%'))
1127                 return dev_alloc_name_ns(net, dev, name);
1128         else if (__dev_get_by_name(net, name))
1129                 return -EEXIST;
1130         else if (dev->name != name)
1131                 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133         return 0;
1134 }
1135 EXPORT_SYMBOL(dev_get_valid_name);
1136
1137 /**
1138  *      dev_change_name - change name of a device
1139  *      @dev: device
1140  *      @newname: name (or format string) must be at least IFNAMSIZ
1141  *
1142  *      Change name of a device, can pass format strings "eth%d".
1143  *      for wildcarding.
1144  */
1145 int dev_change_name(struct net_device *dev, const char *newname)
1146 {
1147         unsigned char old_assign_type;
1148         char oldname[IFNAMSIZ];
1149         int err = 0;
1150         int ret;
1151         struct net *net;
1152
1153         ASSERT_RTNL();
1154         BUG_ON(!dev_net(dev));
1155
1156         net = dev_net(dev);
1157         if (dev->flags & IFF_UP)
1158                 return -EBUSY;
1159
1160         write_seqcount_begin(&devnet_rename_seq);
1161
1162         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1163                 write_seqcount_end(&devnet_rename_seq);
1164                 return 0;
1165         }
1166
1167         memcpy(oldname, dev->name, IFNAMSIZ);
1168
1169         err = dev_get_valid_name(net, dev, newname);
1170         if (err < 0) {
1171                 write_seqcount_end(&devnet_rename_seq);
1172                 return err;
1173         }
1174
1175         if (oldname[0] && !strchr(oldname, '%'))
1176                 netdev_info(dev, "renamed from %s\n", oldname);
1177
1178         old_assign_type = dev->name_assign_type;
1179         dev->name_assign_type = NET_NAME_RENAMED;
1180
1181 rollback:
1182         ret = device_rename(&dev->dev, dev->name);
1183         if (ret) {
1184                 memcpy(dev->name, oldname, IFNAMSIZ);
1185                 dev->name_assign_type = old_assign_type;
1186                 write_seqcount_end(&devnet_rename_seq);
1187                 return ret;
1188         }
1189
1190         write_seqcount_end(&devnet_rename_seq);
1191
1192         netdev_adjacent_rename_links(dev, oldname);
1193
1194         write_lock_bh(&dev_base_lock);
1195         hlist_del_rcu(&dev->name_hlist);
1196         write_unlock_bh(&dev_base_lock);
1197
1198         synchronize_rcu();
1199
1200         write_lock_bh(&dev_base_lock);
1201         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1202         write_unlock_bh(&dev_base_lock);
1203
1204         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1205         ret = notifier_to_errno(ret);
1206
1207         if (ret) {
1208                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1209                 if (err >= 0) {
1210                         err = ret;
1211                         write_seqcount_begin(&devnet_rename_seq);
1212                         memcpy(dev->name, oldname, IFNAMSIZ);
1213                         memcpy(oldname, newname, IFNAMSIZ);
1214                         dev->name_assign_type = old_assign_type;
1215                         old_assign_type = NET_NAME_RENAMED;
1216                         goto rollback;
1217                 } else {
1218                         pr_err("%s: name change rollback failed: %d\n",
1219                                dev->name, ret);
1220                 }
1221         }
1222
1223         return err;
1224 }
1225
1226 /**
1227  *      dev_set_alias - change ifalias of a device
1228  *      @dev: device
1229  *      @alias: name up to IFALIASZ
1230  *      @len: limit of bytes to copy from info
1231  *
1232  *      Set ifalias for a device,
1233  */
1234 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1235 {
1236         char *new_ifalias;
1237
1238         ASSERT_RTNL();
1239
1240         if (len >= IFALIASZ)
1241                 return -EINVAL;
1242
1243         if (!len) {
1244                 kfree(dev->ifalias);
1245                 dev->ifalias = NULL;
1246                 return 0;
1247         }
1248
1249         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1250         if (!new_ifalias)
1251                 return -ENOMEM;
1252         dev->ifalias = new_ifalias;
1253         memcpy(dev->ifalias, alias, len);
1254         dev->ifalias[len] = 0;
1255
1256         return len;
1257 }
1258
1259
1260 /**
1261  *      netdev_features_change - device changes features
1262  *      @dev: device to cause notification
1263  *
1264  *      Called to indicate a device has changed features.
1265  */
1266 void netdev_features_change(struct net_device *dev)
1267 {
1268         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1269 }
1270 EXPORT_SYMBOL(netdev_features_change);
1271
1272 /**
1273  *      netdev_state_change - device changes state
1274  *      @dev: device to cause notification
1275  *
1276  *      Called to indicate a device has changed state. This function calls
1277  *      the notifier chains for netdev_chain and sends a NEWLINK message
1278  *      to the routing socket.
1279  */
1280 void netdev_state_change(struct net_device *dev)
1281 {
1282         if (dev->flags & IFF_UP) {
1283                 struct netdev_notifier_change_info change_info;
1284
1285                 change_info.flags_changed = 0;
1286                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1287                                               &change_info.info);
1288                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1289         }
1290 }
1291 EXPORT_SYMBOL(netdev_state_change);
1292
1293 /**
1294  *      netdev_notify_peers - notify network peers about existence of @dev
1295  *      @dev: network device
1296  *
1297  * Generate traffic such that interested network peers are aware of
1298  * @dev, such as by generating a gratuitous ARP. This may be used when
1299  * a device wants to inform the rest of the network about some sort of
1300  * reconfiguration such as a failover event or virtual machine
1301  * migration.
1302  */
1303 void netdev_notify_peers(struct net_device *dev)
1304 {
1305         rtnl_lock();
1306         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1307         call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1308         rtnl_unlock();
1309 }
1310 EXPORT_SYMBOL(netdev_notify_peers);
1311
1312 static int __dev_open(struct net_device *dev)
1313 {
1314         const struct net_device_ops *ops = dev->netdev_ops;
1315         int ret;
1316
1317         ASSERT_RTNL();
1318
1319         if (!netif_device_present(dev))
1320                 return -ENODEV;
1321
1322         /* Block netpoll from trying to do any rx path servicing.
1323          * If we don't do this there is a chance ndo_poll_controller
1324          * or ndo_poll may be running while we open the device
1325          */
1326         netpoll_poll_disable(dev);
1327
1328         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1329         ret = notifier_to_errno(ret);
1330         if (ret)
1331                 return ret;
1332
1333         set_bit(__LINK_STATE_START, &dev->state);
1334
1335         if (ops->ndo_validate_addr)
1336                 ret = ops->ndo_validate_addr(dev);
1337
1338         if (!ret && ops->ndo_open)
1339                 ret = ops->ndo_open(dev);
1340
1341         netpoll_poll_enable(dev);
1342
1343         if (ret)
1344                 clear_bit(__LINK_STATE_START, &dev->state);
1345         else {
1346                 dev->flags |= IFF_UP;
1347                 dev_set_rx_mode(dev);
1348                 dev_activate(dev);
1349                 add_device_randomness(dev->dev_addr, dev->addr_len);
1350         }
1351
1352         return ret;
1353 }
1354
1355 /**
1356  *      dev_open        - prepare an interface for use.
1357  *      @dev:   device to open
1358  *
1359  *      Takes a device from down to up state. The device's private open
1360  *      function is invoked and then the multicast lists are loaded. Finally
1361  *      the device is moved into the up state and a %NETDEV_UP message is
1362  *      sent to the netdev notifier chain.
1363  *
1364  *      Calling this function on an active interface is a nop. On a failure
1365  *      a negative errno code is returned.
1366  */
1367 int dev_open(struct net_device *dev)
1368 {
1369         int ret;
1370
1371         if (dev->flags & IFF_UP)
1372                 return 0;
1373
1374         ret = __dev_open(dev);
1375         if (ret < 0)
1376                 return ret;
1377
1378         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1379         call_netdevice_notifiers(NETDEV_UP, dev);
1380
1381         return ret;
1382 }
1383 EXPORT_SYMBOL(dev_open);
1384
1385 static int __dev_close_many(struct list_head *head)
1386 {
1387         struct net_device *dev;
1388
1389         ASSERT_RTNL();
1390         might_sleep();
1391
1392         list_for_each_entry(dev, head, close_list) {
1393                 /* Temporarily disable netpoll until the interface is down */
1394                 netpoll_poll_disable(dev);
1395
1396                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1397
1398                 clear_bit(__LINK_STATE_START, &dev->state);
1399
1400                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1401                  * can be even on different cpu. So just clear netif_running().
1402                  *
1403                  * dev->stop() will invoke napi_disable() on all of it's
1404                  * napi_struct instances on this device.
1405                  */
1406                 smp_mb__after_atomic(); /* Commit netif_running(). */
1407         }
1408
1409         dev_deactivate_many(head);
1410
1411         list_for_each_entry(dev, head, close_list) {
1412                 const struct net_device_ops *ops = dev->netdev_ops;
1413
1414                 /*
1415                  *      Call the device specific close. This cannot fail.
1416                  *      Only if device is UP
1417                  *
1418                  *      We allow it to be called even after a DETACH hot-plug
1419                  *      event.
1420                  */
1421                 if (ops->ndo_stop)
1422                         ops->ndo_stop(dev);
1423
1424                 dev->flags &= ~IFF_UP;
1425                 netpoll_poll_enable(dev);
1426         }
1427
1428         return 0;
1429 }
1430
1431 static int __dev_close(struct net_device *dev)
1432 {
1433         int retval;
1434         LIST_HEAD(single);
1435
1436         list_add(&dev->close_list, &single);
1437         retval = __dev_close_many(&single);
1438         list_del(&single);
1439
1440         return retval;
1441 }
1442
1443 int dev_close_many(struct list_head *head, bool unlink)
1444 {
1445         struct net_device *dev, *tmp;
1446
1447         /* Remove the devices that don't need to be closed */
1448         list_for_each_entry_safe(dev, tmp, head, close_list)
1449                 if (!(dev->flags & IFF_UP))
1450                         list_del_init(&dev->close_list);
1451
1452         __dev_close_many(head);
1453
1454         list_for_each_entry_safe(dev, tmp, head, close_list) {
1455                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1456                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1457                 if (unlink)
1458                         list_del_init(&dev->close_list);
1459         }
1460
1461         return 0;
1462 }
1463 EXPORT_SYMBOL(dev_close_many);
1464
1465 /**
1466  *      dev_close - shutdown an interface.
1467  *      @dev: device to shutdown
1468  *
1469  *      This function moves an active device into down state. A
1470  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1471  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1472  *      chain.
1473  */
1474 int dev_close(struct net_device *dev)
1475 {
1476         if (dev->flags & IFF_UP) {
1477                 LIST_HEAD(single);
1478
1479                 list_add(&dev->close_list, &single);
1480                 dev_close_many(&single, true);
1481                 list_del(&single);
1482         }
1483         return 0;
1484 }
1485 EXPORT_SYMBOL(dev_close);
1486
1487
1488 /**
1489  *      dev_disable_lro - disable Large Receive Offload on a device
1490  *      @dev: device
1491  *
1492  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1493  *      called under RTNL.  This is needed if received packets may be
1494  *      forwarded to another interface.
1495  */
1496 void dev_disable_lro(struct net_device *dev)
1497 {
1498         struct net_device *lower_dev;
1499         struct list_head *iter;
1500
1501         dev->wanted_features &= ~NETIF_F_LRO;
1502         netdev_update_features(dev);
1503
1504         if (unlikely(dev->features & NETIF_F_LRO))
1505                 netdev_WARN(dev, "failed to disable LRO!\n");
1506
1507         netdev_for_each_lower_dev(dev, lower_dev, iter)
1508                 dev_disable_lro(lower_dev);
1509 }
1510 EXPORT_SYMBOL(dev_disable_lro);
1511
1512 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1513                                    struct net_device *dev)
1514 {
1515         struct netdev_notifier_info info;
1516
1517         netdev_notifier_info_init(&info, dev);
1518         return nb->notifier_call(nb, val, &info);
1519 }
1520
1521 static int dev_boot_phase = 1;
1522
1523 /**
1524  *      register_netdevice_notifier - register a network notifier block
1525  *      @nb: notifier
1526  *
1527  *      Register a notifier to be called when network device events occur.
1528  *      The notifier passed is linked into the kernel structures and must
1529  *      not be reused until it has been unregistered. A negative errno code
1530  *      is returned on a failure.
1531  *
1532  *      When registered all registration and up events are replayed
1533  *      to the new notifier to allow device to have a race free
1534  *      view of the network device list.
1535  */
1536
1537 int register_netdevice_notifier(struct notifier_block *nb)
1538 {
1539         struct net_device *dev;
1540         struct net_device *last;
1541         struct net *net;
1542         int err;
1543
1544         rtnl_lock();
1545         err = raw_notifier_chain_register(&netdev_chain, nb);
1546         if (err)
1547                 goto unlock;
1548         if (dev_boot_phase)
1549                 goto unlock;
1550         for_each_net(net) {
1551                 for_each_netdev(net, dev) {
1552                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1553                         err = notifier_to_errno(err);
1554                         if (err)
1555                                 goto rollback;
1556
1557                         if (!(dev->flags & IFF_UP))
1558                                 continue;
1559
1560                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1561                 }
1562         }
1563
1564 unlock:
1565         rtnl_unlock();
1566         return err;
1567
1568 rollback:
1569         last = dev;
1570         for_each_net(net) {
1571                 for_each_netdev(net, dev) {
1572                         if (dev == last)
1573                                 goto outroll;
1574
1575                         if (dev->flags & IFF_UP) {
1576                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1577                                                         dev);
1578                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1579                         }
1580                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1581                 }
1582         }
1583
1584 outroll:
1585         raw_notifier_chain_unregister(&netdev_chain, nb);
1586         goto unlock;
1587 }
1588 EXPORT_SYMBOL(register_netdevice_notifier);
1589
1590 /**
1591  *      unregister_netdevice_notifier - unregister a network notifier block
1592  *      @nb: notifier
1593  *
1594  *      Unregister a notifier previously registered by
1595  *      register_netdevice_notifier(). The notifier is unlinked into the
1596  *      kernel structures and may then be reused. A negative errno code
1597  *      is returned on a failure.
1598  *
1599  *      After unregistering unregister and down device events are synthesized
1600  *      for all devices on the device list to the removed notifier to remove
1601  *      the need for special case cleanup code.
1602  */
1603
1604 int unregister_netdevice_notifier(struct notifier_block *nb)
1605 {
1606         struct net_device *dev;
1607         struct net *net;
1608         int err;
1609
1610         rtnl_lock();
1611         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1612         if (err)
1613                 goto unlock;
1614
1615         for_each_net(net) {
1616                 for_each_netdev(net, dev) {
1617                         if (dev->flags & IFF_UP) {
1618                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1619                                                         dev);
1620                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1621                         }
1622                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1623                 }
1624         }
1625 unlock:
1626         rtnl_unlock();
1627         return err;
1628 }
1629 EXPORT_SYMBOL(unregister_netdevice_notifier);
1630
1631 /**
1632  *      call_netdevice_notifiers_info - call all network notifier blocks
1633  *      @val: value passed unmodified to notifier function
1634  *      @dev: net_device pointer passed unmodified to notifier function
1635  *      @info: notifier information data
1636  *
1637  *      Call all network notifier blocks.  Parameters and return value
1638  *      are as for raw_notifier_call_chain().
1639  */
1640
1641 static int call_netdevice_notifiers_info(unsigned long val,
1642                                          struct net_device *dev,
1643                                          struct netdev_notifier_info *info)
1644 {
1645         ASSERT_RTNL();
1646         netdev_notifier_info_init(info, dev);
1647         return raw_notifier_call_chain(&netdev_chain, val, info);
1648 }
1649
1650 /**
1651  *      call_netdevice_notifiers - call all network notifier blocks
1652  *      @val: value passed unmodified to notifier function
1653  *      @dev: net_device pointer passed unmodified to notifier function
1654  *
1655  *      Call all network notifier blocks.  Parameters and return value
1656  *      are as for raw_notifier_call_chain().
1657  */
1658
1659 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1660 {
1661         struct netdev_notifier_info info;
1662
1663         return call_netdevice_notifiers_info(val, dev, &info);
1664 }
1665 EXPORT_SYMBOL(call_netdevice_notifiers);
1666
1667 /**
1668  *      call_netdevice_notifiers_mtu - call all network notifier blocks
1669  *      @val: value passed unmodified to notifier function
1670  *      @dev: net_device pointer passed unmodified to notifier function
1671  *      @arg: additional u32 argument passed to the notifier function
1672  *
1673  *      Call all network notifier blocks.  Parameters and return value
1674  *      are as for raw_notifier_call_chain().
1675  */
1676 static int call_netdevice_notifiers_mtu(unsigned long val,
1677                                         struct net_device *dev, u32 arg)
1678 {
1679         struct netdev_notifier_info_ext info = {
1680                 .info.dev = dev,
1681                 .ext.mtu = arg,
1682         };
1683
1684         BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1685
1686         return call_netdevice_notifiers_info(val, dev, &info.info);
1687 }
1688
#ifdef CONFIG_NET_INGRESS
/* Static key counting the users registered through the helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Take one reference on the ingress static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one reference on the ingress static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif /* CONFIG_NET_INGRESS */
1704
#ifdef CONFIG_NET_EGRESS
/* Static key counting egress users; only touched through the two
 * helpers below within this section.
 */
static struct static_key egress_needed __read_mostly;

/* Register one more egress user. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one egress user; pairs with net_inc_egress_queue(). */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1720
1721 static struct static_key netstamp_needed __read_mostly;
1722 #ifdef HAVE_JUMP_LABEL
1723 static atomic_t netstamp_needed_deferred;
1724 static atomic_t netstamp_wanted;
1725 static void netstamp_clear(struct work_struct *work)
1726 {
1727         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1728         int wanted;
1729
1730         wanted = atomic_add_return(deferred, &netstamp_wanted);
1731         if (wanted > 0)
1732                 static_key_enable(&netstamp_needed);
1733         else
1734                 static_key_disable(&netstamp_needed);
1735 }
1736 static DECLARE_WORK(netstamp_work, netstamp_clear);
1737 #endif
1738
1739 void net_enable_timestamp(void)
1740 {
1741 #ifdef HAVE_JUMP_LABEL
1742         int wanted;
1743
1744         while (1) {
1745                 wanted = atomic_read(&netstamp_wanted);
1746                 if (wanted <= 0)
1747                         break;
1748                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1749                         return;
1750         }
1751         atomic_inc(&netstamp_needed_deferred);
1752         schedule_work(&netstamp_work);
1753 #else
1754         static_key_slow_inc(&netstamp_needed);
1755 #endif
1756 }
1757 EXPORT_SYMBOL(net_enable_timestamp);
1758
1759 void net_disable_timestamp(void)
1760 {
1761 #ifdef HAVE_JUMP_LABEL
1762         int wanted;
1763
1764         while (1) {
1765                 wanted = atomic_read(&netstamp_wanted);
1766                 if (wanted <= 1)
1767                         break;
1768                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1769                         return;
1770         }
1771         atomic_dec(&netstamp_needed_deferred);
1772         schedule_work(&netstamp_work);
1773 #else
1774         static_key_slow_dec(&netstamp_needed);
1775 #endif
1776 }
1777 EXPORT_SYMBOL(net_disable_timestamp);
1778
1779 static inline void net_timestamp_set(struct sk_buff *skb)
1780 {
1781         skb->tstamp.tv64 = 0;
1782         if (static_key_false(&netstamp_needed))
1783                 __net_timestamp(skb);
1784 }
1785
/* Timestamp SKB when the netstamp_needed key is enabled, COND holds and
 * SKB does not carry a timestamp yet.
 *
 * Note: expands to a bare if-statement, not a do { } while (0) block, so
 * it must not be used directly in front of an else branch.
 */
#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)		\
			__net_timestamp(SKB);			\
	}
1791
1792 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1793 {
1794         unsigned int len;
1795
1796         if (!(dev->flags & IFF_UP))
1797                 return false;
1798
1799         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1800         if (skb->len <= len)
1801                 return true;
1802
1803         /* if TSO is enabled, we don't care about the length as the packet
1804          * could be forwarded without being segmented before
1805          */
1806         if (skb_is_gso(skb))
1807                 return true;
1808
1809         return false;
1810 }
1811 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1812
1813 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1814 {
1815         int ret = ____dev_forward_skb(dev, skb);
1816
1817         if (likely(!ret)) {
1818                 skb->protocol = eth_type_trans(skb, dev);
1819                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1820         }
1821
1822         return ret;
1823 }
1824 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1825
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 *
 * Implementation: a non-zero result from __dev_forward_skb() is returned
 * as is; otherwise the skb is queued with netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1849
1850 static inline int deliver_skb(struct sk_buff *skb,
1851                               struct packet_type *pt_prev,
1852                               struct net_device *orig_dev)
1853 {
1854         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1855                 return -ENOMEM;
1856         atomic_inc(&skb->users);
1857         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1858 }
1859
1860 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1861                                           struct packet_type **pt,
1862                                           struct net_device *orig_dev,
1863                                           __be16 type,
1864                                           struct list_head *ptype_list)
1865 {
1866         struct packet_type *ptype, *pt_prev = *pt;
1867
1868         list_for_each_entry_rcu(ptype, ptype_list, list) {
1869                 if (ptype->type != type)
1870                         continue;
1871                 if (pt_prev)
1872                         deliver_skb(skb, pt_prev, orig_dev);
1873                 pt_prev = ptype;
1874         }
1875         *pt = pt_prev;
1876 }
1877
1878 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1879 {
1880         if (!ptype->af_packet_priv || !skb->sk)
1881                 return false;
1882
1883         if (ptype->id_match)
1884                 return ptype->id_match(ptype, skb->sk);
1885         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1886                 return true;
1887
1888         return false;
1889 }
1890
1891 /*
1892  *      Support routine. Sends outgoing frames to any network
1893  *      taps currently in use.
1894  */
1895
1896 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1897 {
1898         struct packet_type *ptype;
1899         struct sk_buff *skb2 = NULL;
1900         struct packet_type *pt_prev = NULL;
1901         struct list_head *ptype_list = &ptype_all;
1902
1903         rcu_read_lock();
1904 again:
1905         list_for_each_entry_rcu(ptype, ptype_list, list) {
1906                 /* Never send packets back to the socket
1907                  * they originated from - MvS (miquels@drinkel.ow.org)
1908                  */
1909                 if (skb_loop_sk(ptype, skb))
1910                         continue;
1911
1912                 if (pt_prev) {
1913                         deliver_skb(skb2, pt_prev, skb->dev);
1914                         pt_prev = ptype;
1915                         continue;
1916                 }
1917
1918                 /* need to clone skb, done only once */
1919                 skb2 = skb_clone(skb, GFP_ATOMIC);
1920                 if (!skb2)
1921                         goto out_unlock;
1922
1923                 net_timestamp_set(skb2);
1924
1925                 /* skb->nh should be correctly
1926                  * set by sender, so that the second statement is
1927                  * just protection against buggy protocols.
1928                  */
1929                 skb_reset_mac_header(skb2);
1930
1931                 if (skb_network_header(skb2) < skb2->data ||
1932                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1933                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1934                                              ntohs(skb2->protocol),
1935                                              dev->name);
1936                         skb_reset_network_header(skb2);
1937                 }
1938
1939                 skb2->transport_header = skb2->network_header;
1940                 skb2->pkt_type = PACKET_OUTGOING;
1941                 pt_prev = ptype;
1942         }
1943
1944         if (ptype_list == &ptype_all) {
1945                 ptype_list = &dev->ptype_all;
1946                 goto again;
1947         }
1948 out_unlock:
1949         if (pt_prev)
1950                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1951         rcu_read_unlock();
1952 }
1953 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1954
1955 /**
1956  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1957  * @dev: Network device
1958  * @txq: number of queues available
1959  *
1960  * If real_num_tx_queues is changed the tc mappings may no longer be
1961  * valid. To resolve this verify the tc mapping remains valid and if
1962  * not NULL the mapping. With no priorities mapping to this
1963  * offset/count pair it will no longer be used. In the worst case TC0
1964  * is invalid nothing can be done so disable priority mappings. If is
1965  * expected that drivers will fix this mapping if they can before
1966  * calling netif_set_real_num_tx_queues.
1967  */
1968 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1969 {
1970         int i;
1971         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972
1973         /* If TC0 is invalidated disable TC mapping */
1974         if (tc->offset + tc->count > txq) {
1975                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1976                 dev->num_tc = 0;
1977                 return;
1978         }
1979
1980         /* Invalidated prio to tc mappings set to TC0 */
1981         for (i = 1; i < TC_BITMASK + 1; i++) {
1982                 int q = netdev_get_prio_tc_map(dev, i);
1983
1984                 tc = &dev->tc_to_txq[q];
1985                 if (tc->offset + tc->count > txq) {
1986                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1987                                 i, q);
1988                         netdev_set_prio_tc_map(dev, i, 0);
1989                 }
1990         }
1991 }
1992
1993 #ifdef CONFIG_XPS
/* Serializes all updates of the XPS maps in this section. */
static DEFINE_MUTEX(xps_map_mutex);

/* Dereference an RCU-protected XPS pointer from update-side code; lockdep
 * checks that xps_map_mutex is held.
 */
#define xmap_dereference(PTR)						\
	rcu_dereference_protected((PTR), lockdep_is_held(&xps_map_mutex))
1997
1998 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1999                                         int cpu, u16 index)
2000 {
2001         struct xps_map *map = NULL;
2002         int pos;
2003
2004         if (dev_maps)
2005                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2006
2007         for (pos = 0; map && pos < map->len; pos++) {
2008                 if (map->queues[pos] == index) {
2009                         if (map->len > 1) {
2010                                 map->queues[pos] = map->queues[--map->len];
2011                         } else {
2012                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
2013                                 kfree_rcu(map, rcu);
2014                                 map = NULL;
2015                         }
2016                         break;
2017                 }
2018         }
2019
2020         return map;
2021 }
2022
2023 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2024 {
2025         struct xps_dev_maps *dev_maps;
2026         int cpu, i;
2027         bool active = false;
2028
2029         mutex_lock(&xps_map_mutex);
2030         dev_maps = xmap_dereference(dev->xps_maps);
2031
2032         if (!dev_maps)
2033                 goto out_no_maps;
2034
2035         for_each_possible_cpu(cpu) {
2036                 for (i = index; i < dev->num_tx_queues; i++) {
2037                         if (!remove_xps_queue(dev_maps, cpu, i))
2038                                 break;
2039                 }
2040                 if (i == dev->num_tx_queues)
2041                         active = true;
2042         }
2043
2044         if (!active) {
2045                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2046                 kfree_rcu(dev_maps, rcu);
2047         }
2048
2049         for (i = index; i < dev->num_tx_queues; i++)
2050                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2051                                              NUMA_NO_NODE);
2052
2053 out_no_maps:
2054         mutex_unlock(&xps_map_mutex);
2055 }
2056
2057 static struct xps_map *expand_xps_map(struct xps_map *map,
2058                                       int cpu, u16 index)
2059 {
2060         struct xps_map *new_map;
2061         int alloc_len = XPS_MIN_MAP_ALLOC;
2062         int i, pos;
2063
2064         for (pos = 0; map && pos < map->len; pos++) {
2065                 if (map->queues[pos] != index)
2066                         continue;
2067                 return map;
2068         }
2069
2070         /* Need to add queue to this CPU's existing map */
2071         if (map) {
2072                 if (pos < map->alloc_len)
2073                         return map;
2074
2075                 alloc_len = map->alloc_len * 2;
2076         }
2077
2078         /* Need to allocate new map to store queue on this CPU's map */
2079         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2080                                cpu_to_node(cpu));
2081         if (!new_map)
2082                 return NULL;
2083
2084         for (i = 0; i < pos; i++)
2085                 new_map->queues[i] = map->queues[i];
2086         new_map->alloc_len = alloc_len;
2087         new_map->len = pos;
2088
2089         return new_map;
2090 }
2091
2092 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2093                         u16 index)
2094 {
2095         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2096         struct xps_map *map, *new_map;
2097         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2098         int cpu, numa_node_id = -2;
2099         bool active = false;
2100
2101         mutex_lock(&xps_map_mutex);
2102
2103         dev_maps = xmap_dereference(dev->xps_maps);
2104
2105         /* allocate memory for queue storage */
2106         for_each_online_cpu(cpu) {
2107                 if (!cpumask_test_cpu(cpu, mask))
2108                         continue;
2109
2110                 if (!new_dev_maps)
2111                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2112                 if (!new_dev_maps) {
2113                         mutex_unlock(&xps_map_mutex);
2114                         return -ENOMEM;
2115                 }
2116
2117                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2118                                  NULL;
2119
2120                 map = expand_xps_map(map, cpu, index);
2121                 if (!map)
2122                         goto error;
2123
2124                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2125         }
2126
2127         if (!new_dev_maps)
2128                 goto out_no_new_maps;
2129
2130         for_each_possible_cpu(cpu) {
2131                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2132                         /* add queue to CPU maps */
2133                         int pos = 0;
2134
2135                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2136                         while ((pos < map->len) && (map->queues[pos] != index))
2137                                 pos++;
2138
2139                         if (pos == map->len)
2140                                 map->queues[map->len++] = index;
2141 #ifdef CONFIG_NUMA
2142                         if (numa_node_id == -2)
2143                                 numa_node_id = cpu_to_node(cpu);
2144                         else if (numa_node_id != cpu_to_node(cpu))
2145                                 numa_node_id = -1;
2146 #endif
2147                 } else if (dev_maps) {
2148                         /* fill in the new device map from the old device map */
2149                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2150                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2151                 }
2152
2153         }
2154
2155         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2156
2157         /* Cleanup old maps */
2158         if (dev_maps) {
2159                 for_each_possible_cpu(cpu) {
2160                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2161                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2162                         if (map && map != new_map)
2163                                 kfree_rcu(map, rcu);
2164                 }
2165
2166                 kfree_rcu(dev_maps, rcu);
2167         }
2168
2169         dev_maps = new_dev_maps;
2170         active = true;
2171
2172 out_no_new_maps:
2173         /* update Tx queue numa node */
2174         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2175                                      (numa_node_id >= 0) ? numa_node_id :
2176                                      NUMA_NO_NODE);
2177
2178         if (!dev_maps)
2179                 goto out_no_maps;
2180
2181         /* removes queue from unused CPUs */
2182         for_each_possible_cpu(cpu) {
2183                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2184                         continue;
2185
2186                 if (remove_xps_queue(dev_maps, cpu, index))
2187                         active = true;
2188         }
2189
2190         /* free map if not active */
2191         if (!active) {
2192                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2193                 kfree_rcu(dev_maps, rcu);
2194         }
2195
2196 out_no_maps:
2197         mutex_unlock(&xps_map_mutex);
2198
2199         return 0;
2200 error:
2201         /* remove any maps that we added */
2202         for_each_possible_cpu(cpu) {
2203                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2204                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2205                                  NULL;
2206                 if (new_map && new_map != map)
2207                         kfree(new_map);
2208         }
2209
2210         mutex_unlock(&xps_map_mutex);
2211
2212         kfree(new_dev_maps);
2213         return -ENOMEM;
2214 }
2215 EXPORT_SYMBOL(netif_set_xps_queue);
2216
2217 #endif
2218 /*
2219  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2220  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2221  */
2222 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2223 {
2224         bool disabling;
2225         int rc;
2226
2227         disabling = txq < dev->real_num_tx_queues;
2228
2229         if (txq < 1 || txq > dev->num_tx_queues)
2230                 return -EINVAL;
2231
2232         if (dev->reg_state == NETREG_REGISTERED ||
2233             dev->reg_state == NETREG_UNREGISTERING) {
2234                 ASSERT_RTNL();
2235
2236                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2237                                                   txq);
2238                 if (rc)
2239                         return rc;
2240
2241                 if (dev->num_tc)
2242                         netif_setup_tc(dev, txq);
2243
2244                 dev->real_num_tx_queues = txq;
2245
2246                 if (disabling) {
2247                         synchronize_net();
2248                         qdisc_reset_all_tx_gt(dev, txq);
2249 #ifdef CONFIG_XPS
2250                         netif_reset_xps_queues_gt(dev, txq);
2251 #endif
2252                 }
2253         } else {
2254                 dev->real_num_tx_queues = txq;
2255         }
2256
2257         return 0;
2258 }
2259 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2260
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  @rxq must lie in
 *	1..@dev->num_rx_queues, otherwise -EINVAL is returned.  For a
 *	registered device the RX queue kobjects are resized first and an
 *	error from that step is returned without changing the count.
 *	Returns 0 on success, or a negative error code.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2293
2294 /**
2295  * netif_get_num_default_rss_queues - default number of RSS queues
2296  *
2297  * This routine should set an upper limit on the number of RSS queues
2298  * used by default by multiqueue devices.
2299  */
2300 int netif_get_num_default_rss_queues(void)
2301 {
2302         return is_kdump_kernel() ?
2303                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2304 }
2305 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2306
2307 static void __netif_reschedule(struct Qdisc *q)
2308 {
2309         struct softnet_data *sd;
2310         unsigned long flags;
2311
2312         local_irq_save(flags);
2313         sd = this_cpu_ptr(&softnet_data);
2314         q->next_sched = NULL;
2315         *sd->output_queue_tailp = q;
2316         sd->output_queue_tailp = &q->next_sched;
2317         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2318         local_irq_restore(flags);
2319 }
2320
2321 void __netif_schedule(struct Qdisc *q)
2322 {
2323         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2324                 __netif_reschedule(q);
2325 }
2326 EXPORT_SYMBOL(__netif_schedule);
2327
2328 struct dev_kfree_skb_cb {
2329         enum skb_free_reason reason;
2330 };
2331
2332 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2333 {
2334         return (struct dev_kfree_skb_cb *)skb->cb;
2335 }
2336
2337 void netif_schedule_queue(struct netdev_queue *txq)
2338 {
2339         rcu_read_lock();
2340         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2341                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2342
2343                 __netif_schedule(q);
2344         }
2345         rcu_read_unlock();
2346 }
2347 EXPORT_SYMBOL(netif_schedule_queue);
2348
2349 /**
2350  *      netif_wake_subqueue - allow sending packets on subqueue
2351  *      @dev: network device
2352  *      @queue_index: sub queue index
2353  *
2354  * Resume individual transmit queue of a device with multiple transmit queues.
2355  */
2356 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2357 {
2358         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2359
2360         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2361                 struct Qdisc *q;
2362
2363                 rcu_read_lock();
2364                 q = rcu_dereference(txq->qdisc);
2365                 __netif_schedule(q);
2366                 rcu_read_unlock();
2367         }
2368 }
2369 EXPORT_SYMBOL(netif_wake_subqueue);
2370
2371 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2372 {
2373         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2374                 struct Qdisc *q;
2375
2376                 rcu_read_lock();
2377                 q = rcu_dereference(dev_queue->qdisc);
2378                 __netif_schedule(q);
2379                 rcu_read_unlock();
2380         }
2381 }
2382 EXPORT_SYMBOL(netif_tx_wake_queue);
2383
2384 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2385 {
2386         unsigned long flags;
2387
2388         if (unlikely(!skb))
2389                 return;
2390
2391         if (likely(atomic_read(&skb->users) == 1)) {
2392                 smp_rmb();
2393                 atomic_set(&skb->users, 0);
2394         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2395                 return;
2396         }
2397         get_kfree_skb_cb(skb)->reason = reason;
2398         local_irq_save(flags);
2399         skb->next = __this_cpu_read(softnet_data.completion_queue);
2400         __this_cpu_write(softnet_data.completion_queue, skb);
2401         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2402         local_irq_restore(flags);
2403 }
2404 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2405
2406 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2407 {
2408         if (in_irq() || irqs_disabled())
2409                 __dev_kfree_skb_irq(skb, reason);
2410         else
2411                 dev_kfree_skb(skb);
2412 }
2413 EXPORT_SYMBOL(__dev_kfree_skb_any);
2414
2415
2416 /**
2417  * netif_device_detach - mark device as removed
2418  * @dev: network device
2419  *
2420  * Mark device as removed from system and therefore no longer available.
2421  */
2422 void netif_device_detach(struct net_device *dev)
2423 {
2424         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2425             netif_running(dev)) {
2426                 netif_tx_stop_all_queues(dev);
2427         }
2428 }
2429 EXPORT_SYMBOL(netif_device_detach);
2430
2431 /**
2432  * netif_device_attach - mark device as attached
2433  * @dev: network device
2434  *
2435  * Mark device as attached from system and restart if needed.
2436  */
2437 void netif_device_attach(struct net_device *dev)
2438 {
2439         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2440             netif_running(dev)) {
2441                 netif_tx_wake_all_queues(dev);
2442                 __netdev_watchdog_up(dev);
2443         }
2444 }
2445 EXPORT_SYMBOL(netif_device_attach);
2446
2447 /*
2448  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2449  * to be used as a distribution range.
2450  */
2451 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2452                   unsigned int num_tx_queues)
2453 {
2454         u32 hash;
2455         u16 qoffset = 0;
2456         u16 qcount = num_tx_queues;
2457
2458         if (skb_rx_queue_recorded(skb)) {
2459                 hash = skb_get_rx_queue(skb);
2460                 while (unlikely(hash >= num_tx_queues))
2461                         hash -= num_tx_queues;
2462                 return hash;
2463         }
2464
2465         if (dev->num_tc) {
2466                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2467                 qoffset = dev->tc_to_txq[tc].offset;
2468                 qcount = dev->tc_to_txq[tc].count;
2469         }
2470
2471         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2472 }
2473 EXPORT_SYMBOL(__skb_tx_hash);
2474
2475 static void skb_warn_bad_offload(const struct sk_buff *skb)
2476 {
2477         static const netdev_features_t null_features;
2478         struct net_device *dev = skb->dev;
2479         const char *name = "";
2480
2481         if (!net_ratelimit())
2482                 return;
2483
2484         if (dev) {
2485                 if (dev->dev.parent)
2486                         name = dev_driver_string(dev->dev.parent);
2487                 else
2488                         name = netdev_name(dev);
2489         }
2490         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2491              "gso_type=%d ip_summed=%d\n",
2492              name, dev ? &dev->features : &null_features,
2493              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2494              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2495              skb_shinfo(skb)->gso_type, skb->ip_summed);
2496 }
2497
2498 /*
2499  * Invalidate hardware checksum when packet is to be mangled, and
2500  * complete checksum manually on outgoing path.
2501  */
2502 int skb_checksum_help(struct sk_buff *skb)
2503 {
2504         __wsum csum;
2505         int ret = 0, offset;
2506
2507         if (skb->ip_summed == CHECKSUM_COMPLETE)
2508                 goto out_set_summed;
2509
2510         if (unlikely(skb_shinfo(skb)->gso_size)) {
2511                 skb_warn_bad_offload(skb);
2512                 return -EINVAL;
2513         }
2514
2515         /* Before computing a checksum, we should make sure no frag could
2516          * be modified by an external entity : checksum could be wrong.
2517          */
2518         if (skb_has_shared_frag(skb)) {
2519                 ret = __skb_linearize(skb);
2520                 if (ret)
2521                         goto out;
2522         }
2523
2524         offset = skb_checksum_start_offset(skb);
2525         BUG_ON(offset >= skb_headlen(skb));
2526         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2527
2528         offset += skb->csum_offset;
2529         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2530
2531         if (skb_cloned(skb) &&
2532             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2533                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2534                 if (ret)
2535                         goto out;
2536         }
2537
2538         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2539 out_set_summed:
2540         skb->ip_summed = CHECKSUM_NONE;
2541 out:
2542         return ret;
2543 }
2544 EXPORT_SYMBOL(skb_checksum_help);
2545
2546 /* skb_csum_offload_check - Driver helper function to determine if a device
2547  * with limited checksum offload capabilities is able to offload the checksum
2548  * for a given packet.
2549  *
2550  * Arguments:
2551  *   skb - sk_buff for the packet in question
2552  *   spec - contains the description of what device can offload
2553  *   csum_encapped - returns true if the checksum being offloaded is
2554  *            encapsulated. That is, it is the checksum for the transport header
2555  *            in the inner headers.
2556  *   checksum_help - when set indicates that helper function should
2557  *            call skb_checksum_help if offload checks fail
2558  *
2559  * Returns:
2560  *   true: Packet has passed the checksum checks and should be offloadable to
2561  *         the device (a driver may still need to check for additional
2562  *         restrictions of its device)
2563  *   false: Checksum is not offloadable. If checksum_help was set then
2564  *         skb_checksum_help was called to resolve checksum for non-GSO
2565  *         packets and when IP protocol is not SCTP
2566  */
2567 bool __skb_csum_offload_chk(struct sk_buff *skb,
2568                             const struct skb_csum_offl_spec *spec,
2569                             bool *csum_encapped,
2570                             bool csum_help)
2571 {
2572         struct iphdr *iph;
2573         struct ipv6hdr *ipv6;
2574         void *nhdr;
2575         int protocol;
2576         u8 ip_proto;
2577
2578         if (skb->protocol == htons(ETH_P_8021Q) ||
2579             skb->protocol == htons(ETH_P_8021AD)) {
2580                 if (!spec->vlan_okay)
2581                         goto need_help;
2582         }
2583
2584         /* We check whether the checksum refers to a transport layer checksum in
2585          * the outermost header or an encapsulated transport layer checksum that
2586          * corresponds to the inner headers of the skb. If the checksum is for
2587          * something else in the packet we need help.
2588          */
2589         if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2590                 /* Non-encapsulated checksum */
2591                 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2592                 nhdr = skb_network_header(skb);
2593                 *csum_encapped = false;
2594                 if (spec->no_not_encapped)
2595                         goto need_help;
2596         } else if (skb->encapsulation && spec->encap_okay &&
2597                    skb_checksum_start_offset(skb) ==
2598                    skb_inner_transport_offset(skb)) {
2599                 /* Encapsulated checksum */
2600                 *csum_encapped = true;
2601                 switch (skb->inner_protocol_type) {
2602                 case ENCAP_TYPE_ETHER:
2603                         protocol = eproto_to_ipproto(skb->inner_protocol);
2604                         break;
2605                 case ENCAP_TYPE_IPPROTO:
2606                         protocol = skb->inner_protocol;
2607                         break;
2608                 }
2609                 nhdr = skb_inner_network_header(skb);
2610         } else {
2611                 goto need_help;
2612         }
2613
2614         switch (protocol) {
2615         case IPPROTO_IP:
2616                 if (!spec->ipv4_okay)
2617                         goto need_help;
2618                 iph = nhdr;
2619                 ip_proto = iph->protocol;
2620                 if (iph->ihl != 5 && !spec->ip_options_okay)
2621                         goto need_help;
2622                 break;
2623         case IPPROTO_IPV6:
2624                 if (!spec->ipv6_okay)
2625                         goto need_help;
2626                 if (spec->no_encapped_ipv6 && *csum_encapped)
2627                         goto need_help;
2628                 ipv6 = nhdr;
2629                 nhdr += sizeof(*ipv6);
2630                 ip_proto = ipv6->nexthdr;
2631                 break;
2632         default:
2633                 goto need_help;
2634         }
2635
2636 ip_proto_again:
2637         switch (ip_proto) {
2638         case IPPROTO_TCP:
2639                 if (!spec->tcp_okay ||
2640                     skb->csum_offset != offsetof(struct tcphdr, check))
2641                         goto need_help;
2642                 break;
2643         case IPPROTO_UDP:
2644                 if (!spec->udp_okay ||
2645                     skb->csum_offset != offsetof(struct udphdr, check))
2646                         goto need_help;
2647                 break;
2648         case IPPROTO_SCTP:
2649                 if (!spec->sctp_okay ||
2650                     skb->csum_offset != offsetof(struct sctphdr, checksum))
2651                         goto cant_help;
2652                 break;
2653         case NEXTHDR_HOP:
2654         case NEXTHDR_ROUTING:
2655         case NEXTHDR_DEST: {
2656                 u8 *opthdr = nhdr;
2657
2658                 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2659                         goto need_help;
2660
2661                 ip_proto = opthdr[0];
2662                 nhdr += (opthdr[1] + 1) << 3;
2663
2664                 goto ip_proto_again;
2665         }
2666         default:
2667                 goto need_help;
2668         }
2669
2670         /* Passed the tests for offloading checksum */
2671         return true;
2672
2673 need_help:
2674         if (csum_help && !skb_shinfo(skb)->gso_size)
2675                 skb_checksum_help(skb);
2676 cant_help:
2677         return false;
2678 }
2679 EXPORT_SYMBOL(__skb_csum_offload_chk);
2680
2681 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2682 {
2683         __be16 type = skb->protocol;
2684
2685         /* Tunnel gso handlers can set protocol to ethernet. */
2686         if (type == htons(ETH_P_TEB)) {
2687                 struct ethhdr *eth;
2688
2689                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2690                         return 0;
2691
2692                 eth = (struct ethhdr *)skb->data;
2693                 type = eth->h_proto;
2694         }
2695
2696         return __vlan_get_protocol(skb, type, depth);
2697 }
2698
2699 /**
2700  *      skb_mac_gso_segment - mac layer segmentation handler.
2701  *      @skb: buffer to segment
2702  *      @features: features for the output path (see dev->features)
2703  */
2704 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2705                                     netdev_features_t features)
2706 {
2707         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2708         struct packet_offload *ptype;
2709         int vlan_depth = skb->mac_len;
2710         __be16 type = skb_network_protocol(skb, &vlan_depth);
2711
2712         if (unlikely(!type))
2713                 return ERR_PTR(-EINVAL);
2714
2715         __skb_pull(skb, vlan_depth);
2716
2717         rcu_read_lock();
2718         list_for_each_entry_rcu(ptype, &offload_base, list) {
2719                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2720                         segs = ptype->callbacks.gso_segment(skb, features);
2721                         break;
2722                 }
2723         }
2724         rcu_read_unlock();
2725
2726         __skb_push(skb, skb->data - skb_mac_header(skb));
2727
2728         return segs;
2729 }
2730 EXPORT_SYMBOL(skb_mac_gso_segment);
2731
2732
2733 /* openvswitch calls this on rx path, so we need a different check.
2734  */
2735 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2736 {
2737         if (tx_path)
2738                 return skb->ip_summed != CHECKSUM_PARTIAL &&
2739                        skb->ip_summed != CHECKSUM_UNNECESSARY;
2740
2741         return skb->ip_summed == CHECKSUM_NONE;
2742 }
2743
2744 /**
2745  *      __skb_gso_segment - Perform segmentation on skb.
2746  *      @skb: buffer to segment
2747  *      @features: features for the output path (see dev->features)
2748  *      @tx_path: whether it is called in TX path
2749  *
2750  *      This function segments the given skb and returns a list of segments.
2751  *
2752  *      It may return NULL if the skb requires no segmentation.  This is
2753  *      only possible when GSO is used for verifying header integrity.
2754  *
2755  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2756  */
2757 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2758                                   netdev_features_t features, bool tx_path)
2759 {
2760         struct sk_buff *segs;
2761
2762         if (unlikely(skb_needs_check(skb, tx_path))) {
2763                 int err;
2764
2765                 /* We're going to init ->check field in TCP or UDP header */
2766                 err = skb_cow_head(skb, 0);
2767                 if (err < 0)
2768                         return ERR_PTR(err);
2769         }
2770
2771         /* Only report GSO partial support if it will enable us to
2772          * support segmentation on this frame without needing additional
2773          * work.
2774          */
2775         if (features & NETIF_F_GSO_PARTIAL) {
2776                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2777                 struct net_device *dev = skb->dev;
2778
2779                 partial_features |= dev->features & dev->gso_partial_features;
2780                 if (!skb_gso_ok(skb, features | partial_features))
2781                         features &= ~NETIF_F_GSO_PARTIAL;
2782         }
2783
2784         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2785                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2786
2787         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2788         SKB_GSO_CB(skb)->encap_level = 0;
2789
2790         skb_reset_mac_header(skb);
2791         skb_reset_mac_len(skb);
2792
2793         segs = skb_mac_gso_segment(skb, features);
2794
2795         if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2796                 skb_warn_bad_offload(skb);
2797
2798         return segs;
2799 }
2800 EXPORT_SYMBOL(__skb_gso_segment);
2801
/* Take action when hardware reception checksum errors are detected:
 * emit a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2813
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if any page fragment of @skb is unsuitable for @dev: either a
 * highmem page on a device without NETIF_F_HIGHDMA, or (when
 * PCI_DMA_BUS_IS_PHYS) a page whose physical range exceeds the parent
 * device's DMA mask. Returns 0 otherwise, and always when CONFIG_HIGHMEM
 * is not set.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int idx;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (idx = 0; idx < shinfo->nr_frags; idx++) {
		dma_addr_t addr;

		addr = page_to_phys(skb_frag_page(&shinfo->frags[idx]));
		/* The last byte of the page must be reachable under the mask */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2846
2847 /* If MPLS offload request, verify we are testing hardware MPLS features
2848  * instead of standard features for the netdev.
2849  */
2850 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2851 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2852                                            netdev_features_t features,
2853                                            __be16 type)
2854 {
2855         if (eth_p_mpls(type))
2856                 features &= skb->dev->mpls_features;
2857
2858         return features;
2859 }
2860 #else
2861 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2862                                            netdev_features_t features,
2863                                            __be16 type)
2864 {
2865         return features;
2866 }
2867 #endif
2868
2869 static netdev_features_t harmonize_features(struct sk_buff *skb,
2870         netdev_features_t features)
2871 {
2872         int tmp;
2873         __be16 type;
2874
2875         type = skb_network_protocol(skb, &tmp);
2876         features = net_mpls_features(skb, features, type);
2877
2878         if (skb->ip_summed != CHECKSUM_NONE &&
2879             !can_checksum_protocol(features, type)) {
2880                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2881         }
2882         if (illegal_highdma(skb->dev, skb))
2883                 features &= ~NETIF_F_SG;
2884
2885         return features;
2886 }
2887
2888 netdev_features_t passthru_features_check(struct sk_buff *skb,
2889                                           struct net_device *dev,
2890                                           netdev_features_t features)
2891 {
2892         return features;
2893 }
2894 EXPORT_SYMBOL(passthru_features_check);
2895
2896 static netdev_features_t dflt_features_check(struct sk_buff *skb,
2897                                              struct net_device *dev,
2898                                              netdev_features_t features)
2899 {
2900         return vlan_features_check(skb, features);
2901 }
2902
2903 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2904                                             struct net_device *dev,
2905                                             netdev_features_t features)
2906 {
2907         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2908
2909         if (gso_segs > dev->gso_max_segs)
2910                 return features & ~NETIF_F_GSO_MASK;
2911
2912         /* Support for GSO partial features requires software
2913          * intervention before we can actually process the packets
2914          * so we need to strip support for any partial features now
2915          * and we can pull them back in after we have partially
2916          * segmented the frame.
2917          */
2918         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2919                 features &= ~dev->gso_partial_features;
2920
2921         /* Make sure to clear the IPv4 ID mangling feature if the
2922          * IPv4 header has the potential to be fragmented.
2923          */
2924         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2925                 struct iphdr *iph = skb->encapsulation ?
2926                                     inner_ip_hdr(skb) : ip_hdr(skb);
2927
2928                 if (!(iph->frag_off & htons(IP_DF)))
2929                         features &= ~NETIF_F_TSO_MANGLEID;
2930         }
2931
2932         return features;
2933 }
2934
2935 netdev_features_t netif_skb_features(struct sk_buff *skb)
2936 {
2937         struct net_device *dev = skb->dev;
2938         netdev_features_t features = dev->features;
2939
2940         if (skb_is_gso(skb))
2941                 features = gso_features_check(skb, dev, features);
2942
2943         /* If encapsulation offload request, verify we are testing
2944          * hardware encapsulation features instead of standard
2945          * features for the netdev
2946          */
2947         if (skb->encapsulation)
2948                 features &= dev->hw_enc_features;
2949
2950         if (skb_vlan_tagged(skb))
2951                 features = netdev_intersect_features(features,
2952                                                      dev->vlan_features |
2953                                                      NETIF_F_HW_VLAN_CTAG_TX |
2954                                                      NETIF_F_HW_VLAN_STAG_TX);
2955
2956         if (dev->netdev_ops->ndo_features_check)
2957                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2958                                                                 features);
2959         else
2960                 features &= dflt_features_check(skb, dev, features);
2961
2962         return harmonize_features(skb, features);
2963 }
2964 EXPORT_SYMBOL(netif_skb_features);
2965
2966 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2967                     struct netdev_queue *txq, bool more)
2968 {
2969         unsigned int len;
2970         int rc;
2971
2972         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2973                 dev_queue_xmit_nit(skb, dev);
2974
2975         len = skb->len;
2976         trace_net_dev_start_xmit(skb, dev);
2977         rc = netdev_start_xmit(skb, dev, txq, more);
2978         trace_net_dev_xmit(skb, rc, dev, len);
2979
2980         return rc;
2981 }
2982
2983 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2984                                     struct netdev_queue *txq, int *ret)
2985 {
2986         struct sk_buff *skb = first;
2987         int rc = NETDEV_TX_OK;
2988
2989         while (skb) {
2990                 struct sk_buff *next = skb->next;
2991
2992                 skb->next = NULL;
2993                 rc = xmit_one(skb, dev, txq, next != NULL);
2994                 if (unlikely(!dev_xmit_complete(rc))) {
2995                         skb->next = next;
2996                         goto out;
2997                 }
2998
2999                 skb = next;
3000                 if (netif_xmit_stopped(txq) && skb) {
3001                         rc = NETDEV_TX_BUSY;
3002                         break;
3003                 }
3004         }
3005
3006 out:
3007         *ret = rc;
3008         return skb;
3009 }
3010
3011 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3012                                           netdev_features_t features)
3013 {
3014         if (skb_vlan_tag_present(skb) &&
3015             !vlan_hw_offload_capable(features, skb->vlan_proto))
3016                 skb = __vlan_hwaccel_push_inside(skb);
3017         return skb;
3018 }
3019
3020 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
3021 {
3022         netdev_features_t features;
3023
3024         features = netif_skb_features(skb);
3025         skb = validate_xmit_vlan(skb, features);
3026         if (unlikely(!skb))
3027                 goto out_null;
3028
3029         if (netif_needs_gso(skb, features)) {
3030                 struct sk_buff *segs;
3031
3032                 segs = skb_gso_segment(skb, features);
3033                 if (IS_ERR(segs)) {
3034                         goto out_kfree_skb;
3035                 } else if (segs) {
3036                         consume_skb(skb);
3037                         skb = segs;
3038                 }
3039         } else {
3040                 if (skb_needs_linearize(skb, features) &&
3041                     __skb_linearize(skb))
3042                         goto out_kfree_skb;
3043
3044                 /* If packet is not checksummed and device does not
3045                  * support checksumming for this protocol, complete
3046                  * checksumming here.
3047                  */
3048                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3049                         if (skb->encapsulation)
3050                                 skb_set_inner_transport_header(skb,
3051                                                                skb_checksum_start_offset(skb));
3052                         else
3053                                 skb_set_transport_header(skb,
3054                                                          skb_checksum_start_offset(skb));
3055                         if (!(features & NETIF_F_CSUM_MASK) &&
3056                             skb_checksum_help(skb))
3057                                 goto out_kfree_skb;
3058                 }
3059         }
3060
3061         return skb;
3062
3063 out_kfree_skb:
3064         kfree_skb(skb);
3065 out_null:
3066         atomic_long_inc(&dev->tx_dropped);
3067         return NULL;
3068 }
3069
3070 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3071 {
3072         struct sk_buff *next, *head = NULL, *tail;
3073
3074         for (; skb != NULL; skb = next) {
3075                 next = skb->next;
3076                 skb->next = NULL;
3077
3078                 /* in case skb wont be segmented, point to itself */
3079                 skb->prev = skb;
3080
3081                 skb = validate_xmit_skb(skb, dev);
3082                 if (!skb)
3083                         continue;
3084
3085                 if (!head)
3086                         head = skb;
3087                 else
3088                         tail->next = skb;
3089                 /* If skb was segmented, skb->prev points to
3090                  * the last segment. If not, it still contains skb.
3091                  */
3092                 tail = skb->prev;
3093         }
3094         return head;
3095 }
3096 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3097
3098 static void qdisc_pkt_len_init(struct sk_buff *skb)
3099 {
3100         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3101
3102         qdisc_skb_cb(skb)->pkt_len = skb->len;
3103
3104         /* To get more precise estimation of bytes sent on wire,
3105          * we add to pkt_len the headers size of all segments
3106          */
3107         if (shinfo->gso_size)  {
3108                 unsigned int hdr_len;
3109                 u16 gso_segs = shinfo->gso_segs;
3110
3111                 /* mac layer + network layer */
3112                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3113
3114                 /* + transport layer */
3115                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3116                         const struct tcphdr *th;
3117                         struct tcphdr _tcphdr;
3118
3119                         th = skb_header_pointer(skb, skb_transport_offset(skb),
3120                                                 sizeof(_tcphdr), &_tcphdr);
3121                         if (likely(th))
3122                                 hdr_len += __tcp_hdrlen(th);
3123                 } else {
3124                         struct udphdr _udphdr;
3125
3126                         if (skb_header_pointer(skb, skb_transport_offset(skb),
3127                                                sizeof(_udphdr), &_udphdr))
3128                                 hdr_len += sizeof(struct udphdr);
3129                 }
3130
3131                 if (shinfo->gso_type & SKB_GSO_DODGY)
3132                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3133                                                 shinfo->gso_size);
3134
3135                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3136         }
3137 }
3138
3139 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3140                                  struct net_device *dev,
3141                                  struct netdev_queue *txq)
3142 {
3143         spinlock_t *root_lock = qdisc_lock(q);
3144         struct sk_buff *to_free = NULL;
3145         bool contended;
3146         int rc;
3147
3148         qdisc_calculate_pkt_len(skb, q);
3149         /*
3150          * Heuristic to force contended enqueues to serialize on a
3151          * separate lock before trying to get qdisc main lock.
3152          * This permits qdisc->running owner to get the lock more
3153          * often and dequeue packets faster.
3154          */
3155         contended = qdisc_is_running(q);
3156         if (unlikely(contended))
3157                 spin_lock(&q->busylock);
3158
3159         spin_lock(root_lock);
3160         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3161                 __qdisc_drop(skb, &to_free);
3162                 rc = NET_XMIT_DROP;
3163         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3164                    qdisc_run_begin(q)) {
3165                 /*
3166                  * This is a work-conserving queue; there are no old skbs
3167                  * waiting to be sent out; and the qdisc is not running -
3168                  * xmit the skb directly.
3169                  */
3170
3171                 qdisc_bstats_update(q, skb);
3172
3173                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3174                         if (unlikely(contended)) {
3175                                 spin_unlock(&q->busylock);
3176                                 contended = false;
3177                         }
3178                         __qdisc_run(q);
3179                 } else
3180                         qdisc_run_end(q);
3181
3182                 rc = NET_XMIT_SUCCESS;
3183         } else {
3184                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3185                 if (qdisc_run_begin(q)) {
3186                         if (unlikely(contended)) {
3187                                 spin_unlock(&q->busylock);
3188                                 contended = false;
3189                         }
3190                         __qdisc_run(q);
3191                 }
3192         }
3193         spin_unlock(root_lock);
3194         if (unlikely(to_free))
3195                 kfree_skb_list(to_free);
3196         if (unlikely(contended))
3197                 spin_unlock(&q->busylock);
3198         return rc;
3199 }
3200
3201 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3202 static void skb_update_prio(struct sk_buff *skb)
3203 {
3204         const struct netprio_map *map;
3205         const struct sock *sk;
3206         unsigned int prioidx;
3207
3208         if (skb->priority)
3209                 return;
3210         map = rcu_dereference_bh(skb->dev->priomap);
3211         if (!map)
3212                 return;
3213         sk = skb_to_full_sk(skb);
3214         if (!sk)
3215                 return;
3216
3217         prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3218
3219         if (prioidx < map->priomap_len)
3220                 skb->priority = map->priomap[prioidx];
3221 }
3222 #else
3223 #define skb_update_prio(skb)
3224 #endif
3225
/* Per-CPU counter. __dev_queue_xmit() compares it against
 * XMIT_RECURSION_LIMIT on the path used for devices without a queue.
 */
3226 DEFINE_PER_CPU(int, xmit_recursion);
3227 EXPORT_SYMBOL(xmit_recursion);
3228
3229 /**
3230  *      dev_loopback_xmit - loop back @skb
3231  *      @net: network namespace this loopback is happening in
3232  *      @sk:  sk needed to be a netfilter okfn
3233  *      @skb: buffer to transmit
3234  */
3235 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3236 {
3237         skb_reset_mac_header(skb);
3238         __skb_pull(skb, skb_network_offset(skb));
3239         skb->pkt_type = PACKET_LOOPBACK;
3240         skb->ip_summed = CHECKSUM_UNNECESSARY;
3241         WARN_ON(!skb_dst(skb));
3242         skb_dst_force(skb);
3243         netif_rx_ni(skb);
3244         return 0;
3245 }
3246 EXPORT_SYMBOL(dev_loopback_xmit);
3247
#ifdef CONFIG_NET_EGRESS
/* Run the egress classifier list of @dev (if any) on @skb.
 *
 * Returns @skb when transmission should continue. Returns NULL when the
 * classifier verdict disposed of the skb (dropped, stolen/queued, or
 * redirected); in that case *@ret holds the NET_XMIT_* code to report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *filter = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;

	if (!filter)
		return skb;

	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
	 * earlier by the caller.
	 */
	qdisc_bstats_cpu_update(filter->q, skb);

	switch (tc_classify(skb, filter, &res, false)) {
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filter->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3290
/* Look up a TX queue for @skb in the XPS map of @dev, indexed by
 * skb->sender_cpu - 1. Returns the queue index, or -1 when XPS is not
 * configured, no map entry exists, or the chosen queue is not below
 * dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue_index = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[skb->sender_cpu - 1]);
	if (!cpu_map)
		goto unlock;

	/* A single-entry map needs no hashing */
	if (cpu_map->len == 1)
		queue_index = cpu_map->queues[0];
	else
		queue_index = cpu_map->queues[reciprocal_scale(skb_get_hash(skb),
							       cpu_map->len)];

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
3320
3321 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3322 {
3323         struct sock *sk = skb->sk;
3324         int queue_index = sk_tx_queue_get(sk);
3325
3326         if (queue_index < 0 || skb->ooo_okay ||
3327             queue_index >= dev->real_num_tx_queues) {
3328                 int new_index = get_xps_queue(dev, skb);
3329                 if (new_index < 0)
3330                         new_index = skb_tx_hash(dev, skb);
3331
3332                 if (queue_index != new_index && sk &&
3333                     sk_fullsock(sk) &&
3334                     rcu_access_pointer(sk->sk_dst_cache))
3335                         sk_tx_queue_set(sk, new_index);
3336
3337                 queue_index = new_index;
3338         }
3339
3340         return queue_index;
3341 }
3342
3343 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3344                                     struct sk_buff *skb,
3345                                     void *accel_priv)
3346 {
3347         int queue_index = 0;
3348
3349 #ifdef CONFIG_XPS
3350         u32 sender_cpu = skb->sender_cpu - 1;
3351
3352         if (sender_cpu >= (u32)NR_CPUS)
3353                 skb->sender_cpu = raw_smp_processor_id() + 1;
3354 #endif
3355
3356         if (dev->real_num_tx_queues != 1) {
3357                 const struct net_device_ops *ops = dev->netdev_ops;
3358                 if (ops->ndo_select_queue)
3359                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3360                                                             __netdev_pick_tx);
3361                 else
3362                         queue_index = __netdev_pick_tx(dev, skb);
3363
3364                 if (!accel_priv)
3365                         queue_index = netdev_cap_txqueue(dev, queue_index);
3366         }
3367
3368         skb_set_queue_mapping(skb, queue_index);
3369         return netdev_get_tx_queue(dev, queue_index);
3370 }
3371
3372 /**
3373  *      __dev_queue_xmit - transmit a buffer
3374  *      @skb: buffer to transmit
3375  *      @accel_priv: private data used for L2 forwarding offload
3376  *
3377  *      Queue a buffer for transmission to a network device. The caller must
3378  *      have set the device and priority and built the buffer before calling
3379  *      this function. The function can be called from an interrupt.
3380  *
3381  *      A negative errno code is returned on a failure. A success does not
3382  *      guarantee the frame will be transmitted as it may be dropped due
3383  *      to congestion or traffic shaping.
3384  *
3385  * -----------------------------------------------------------------------------------
3386  *      This function can also return errors from the queue disciplines,
3387  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3388  *      be positive.
3389  *
3390  *      Regardless of the return value, the skb is consumed, so it is currently
3391  *      difficult to retry a send to this method.  (You can bump the ref count
3392  *      before sending to hold a reference for retry if you are careful.)
3393  *
3394  *      When calling this method, interrupts MUST be enabled.  This is because
3395  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3396  *          --BLG
      *
      *      Paths taken below:
      *       - the qdisc of the selected tx queue has an enqueue handler: the
      *         skb goes to __dev_xmit_skb() and its result is returned;
      *       - no enqueue handler and the device is IFF_UP: the skb is
      *         validated and handed to dev_hard_start_xmit() under
      *         HARD_TX_LOCK;
      *       - every remaining case (device down, queue stopped, transmit not
      *         completed, recursion detected): dev->tx_dropped is incremented,
      *         the skb list is freed and -ENETDOWN is returned.
3397  */
3398 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3399 {
3400         struct net_device *dev = skb->dev;
3401         struct netdev_queue *txq;
3402         struct Qdisc *q;
             /* Result reported if an early exit is taken before anything
              * below assigns rc.
              */
3403         int rc = -ENOMEM;
3404
3405         skb_reset_mac_header(skb);
3406
3407         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3408                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3409
3410         /* Disable soft irqs for various locks below. Also
3411          * stops preemption for RCU.
3412          */
3413         rcu_read_lock_bh();
3414
3415         skb_update_prio(skb);
3416
3417         qdisc_pkt_len_init(skb);
3418 #ifdef CONFIG_NET_CLS_ACT
3419         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3420 # ifdef CONFIG_NET_EGRESS
3421         if (static_key_false(&egress_needed)) {
                     /* sch_handle_egress() is given rc by address.  A NULL
                      * return means there is no skb left to send, so rc is
                      * returned as it stands.
                      */
3422                 skb = sch_handle_egress(skb, &rc, dev);
3423                 if (!skb)
3424                         goto out;
3425         }
3426 # endif
3427 #endif
3428         /* If device/qdisc don't need skb->dst, release it right now while
3429          * it's hot in this cpu cache.
3430          */
3431         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3432                 skb_dst_drop(skb);
3433         else
3434                 skb_dst_force(skb);
3435
3436         txq = netdev_pick_tx(dev, skb, accel_priv);
3437         q = rcu_dereference_bh(txq->qdisc);
3438
3439         trace_net_dev_queue(skb);
             /* The qdisc has an enqueue handler: pass the skb to it and
              * return whatever __dev_xmit_skb() reports.
              */
3440         if (q->enqueue) {
3441                 rc = __dev_xmit_skb(skb, q, dev, txq);
3442                 goto out;
3443         }
3444
3445         /* The device has no queue. Common case for software devices:
3446            loopback, all the sorts of tunnels...
3447
3448            Really, it is unlikely that netif_tx_lock protection is necessary
3449            here.  (e.g. loopback and IP tunnels are clean ignoring statistics
3450            counters.)
3451            However, it is possible that they rely on protection
3452            made by us here.
3453
3454            Check this and shoot the lock. It is not prone to deadlocks.
3455            Either shoot noqueue qdisc, it is even simpler 8)
3456          */
3457         if (dev->flags & IFF_UP) {
3458                 int cpu = smp_processor_id(); /* ok because BHs are off */
3459
                     /* If this CPU already owns the tx lock we were re-entered
                      * while transmitting on this queue; that case is reported
                      * in the else branch instead of taking the lock again.
                      */
3460                 if (txq->xmit_lock_owner != cpu) {
3461                         if (unlikely(__this_cpu_read(xmit_recursion) >
3462                                      XMIT_RECURSION_LIMIT))
3463                                 goto recursion_alert;
3464
                             /* A NULL return leaves nothing to transmit; rc
                              * keeps the value it had before this call.
                              */
3465                         skb = validate_xmit_skb(skb, dev);
3466                         if (!skb)
3467                                 goto out;
3468
3469                         HARD_TX_LOCK(dev, txq, cpu);
3470
3471                         if (!netif_xmit_stopped(txq)) {
                                     /* xmit_recursion is the per-cpu nesting
                                      * depth tested against
                                      * XMIT_RECURSION_LIMIT above.
                                      */
3472                                 __this_cpu_inc(xmit_recursion);
3473                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3474                                 __this_cpu_dec(xmit_recursion);
3475                                 if (dev_xmit_complete(rc)) {
3476                                         HARD_TX_UNLOCK(dev, txq);
3477                                         goto out;
3478                                 }
3479                         }
                             /* Reached when the queue was stopped or the
                              * transmit did not complete.
                              */
3480                         HARD_TX_UNLOCK(dev, txq);
3481                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3482                                              dev->name);
3483                 } else {
3484                         /* Recursion is detected! It is possible,
3485                          * unfortunately
3486                          */
3487 recursion_alert:
3488                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3489                                              dev->name);
3490                 }
3491         }
3492
             /* Common drop path: device down or any failure that fell
              * through above.  The skb (list) is still ours to free here.
              */
3493         rc = -ENETDOWN;
3494         rcu_read_unlock_bh();
3495
3496         atomic_long_inc(&dev->tx_dropped);
3497         kfree_skb_list(skb);
3498         return rc;
3499 out:
3500         rcu_read_unlock_bh();
3501         return rc;
3502 }
3503
     /**
      *      dev_queue_xmit - transmit a buffer
      *      @skb: buffer to transmit
      *
      *      Entry point for callers that have no L2 forwarding offload data:
      *      forwards @skb to __dev_queue_xmit() with a NULL accel_priv and
      *      returns that function's result unchanged.
      */
3504 int dev_queue_xmit(struct sk_buff *skb)
3505 {
             void *no_accel_priv = NULL;

3506         return __dev_queue_xmit(skb, no_accel_priv);
3507 }
3508 EXPORT_SYMBOL(dev_queue_xmit);
3509
     /**
      *      dev_queue_xmit_accel - transmit a buffer with offload private data
      *      @skb: buffer to transmit
      *      @accel_priv: private data used for L2 forwarding offload
      *
      *      Hands both arguments to __dev_queue_xmit() and returns its result
      *      unchanged.
      */
3510 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3511 {
             int rc;

3512         rc = __dev_queue_xmit(skb, accel_priv);
             return rc;
3513 }
3514 EXPORT_SYMBOL(dev_queue_xmit_accel);
3515
3516
3517 /*=======================================================================
3518                         Receiver routines
3519   =======================================================================*/
3520
     /* Limit on the per-CPU input_pkt_queue length: enqueue_to_backlog() drops
      * a packet when the queue already holds more than this many skbs, and
      * skb_flow_limit() only starts its per-flow accounting once the queue
      * length reaches half of this value.
      */
3521 int netdev_max_backlog __read_mostly = 1000;
3522 EXPORT_SYMBOL(netdev_max_backlog);
3523
     /* Passed as the condition argument of net_timestamp_check() as-is by
      * netif_rx_internal() and negated by __netif_receive_skb_core().
      */
3524 int netdev_tstamp_prequeue __read_mostly = 1;
     /* netdev_budget and weight_p are not referenced in this part of the file. */
3525 int netdev_budget __read_mostly = 300;
3526 int weight_p __read_mostly = 64;            /* old backlog weight */
3527
3528 /* Append @napi to the tail of @sd's poll list and raise NET_RX_SOFTIRQ.
      * Must be called with irqs disabled (the softirq is raised with the
      * _irqoff variant).
      */
3529 static inline void ____napi_schedule(struct softnet_data *sd,
3530                                      struct napi_struct *napi)
3531 {
             struct list_head *pending = &sd->poll_list;

3532         list_add_tail(&napi->poll_list, pending);
3533         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3534 }
3535
3536 #ifdef CONFIG_RPS
3537
3538 /* One global table that all flow-based protocols share.
      * get_rps_cpu() indexes it with the packet hash to find the CPU wanted
      * for the flow.
      */
3539 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3540 EXPORT_SYMBOL(rps_sock_flow_table);
     /* Splits an entry of the table above: get_rps_cpu() takes the bits under
      * this mask as the CPU number and compares the remaining bits with the
      * packet hash.
      */
3541 u32 rps_cpu_mask __read_mostly;
3542 EXPORT_SYMBOL(rps_cpu_mask);
3543
     /* Static key tested by netif_rx_internal(); get_rps_cpu() is only
      * consulted there when the key is enabled.
      */
3544 struct static_key rps_needed __read_mostly;
3545 EXPORT_SYMBOL(rps_needed);
3546
     /*
      * set_rps_cpu - record @next_cpu in a device flow entry
      * @dev: receiving device
      * @skb: packet belonging to the flow
      * @rflow: flow entry currently used for the flow
      * @next_cpu: CPU the flow should be handled on from now on
      *
      * Returns the flow entry that was updated.  That is @rflow itself unless
      * the CONFIG_RFS_ACCEL path below succeeds, in which case it is the entry
      * in the flow table of the rx queue that cpu_rmap_lookup_index() gives
      * for @next_cpu.
      *
      * When @next_cpu is below nr_cpu_ids the entry's last_qtail is reset to
      * that CPU's current input_queue_head; the entry's cpu field is set to
      * @next_cpu in every case.
      */
3547 static struct rps_dev_flow *
3548 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3549             struct rps_dev_flow *rflow, u16 next_cpu)
3550 {
3551         if (next_cpu < nr_cpu_ids) {
3552 #ifdef CONFIG_RFS_ACCEL
3553                 struct netdev_rx_queue *rxqueue;
3554                 struct rps_dev_flow_table *flow_table;
3555                 struct rps_dev_flow *old_rflow;
3556                 u32 flow_id;
3557                 u16 rxq_index;
3558                 int rc;
3559
3560                 /* Should we steer this flow to a different hardware queue? */
3561                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3562                     !(dev->features & NETIF_F_NTUPLE))
3563                         goto out;
3564                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* The packet already arrived on that queue: nothing to steer. */
3565                 if (rxq_index == skb_get_rx_queue(skb))
3566                         goto out;
3567
3568                 rxqueue = dev->_rx + rxq_index;
3569                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3570                 if (!flow_table)
3571                         goto out;
3572                 flow_id = skb_get_hash(skb) & flow_table->mask;
                     /* Ask the driver to steer the flow; a negative result
                      * leaves the original entry in use.
                      */
3573                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3574                                                         rxq_index, flow_id);
3575                 if (rc < 0)
3576                         goto out;
                     /* Switch to the entry in the target queue's table and
                      * store the driver's return value as its filter id.
                      */
3577                 old_rflow = rflow;
3578                 rflow = &flow_table->flows[flow_id];
3579                 rflow->filter = rc;
                     /* If the previous entry carries the same filter id, mark
                      * it as having no filter.
                      */
3580                 if (old_rflow->filter == rflow->filter)
3581                         old_rflow->filter = RPS_NO_FILTER;
3582         out:
3583 #endif
3584                 rflow->last_qtail =
3585                         per_cpu(softnet_data, next_cpu).input_queue_head;
3586         }
3587
3588         rflow->cpu = next_cpu;
3589         return rflow;
3590 }
3591
3592 /*
3593  * get_rps_cpu is called from netif_receive_skb (and, in this part of the
3594  * file, from netif_rx_internal) and returns the target CPU for a given skb,
3595  * or -1 when no CPU was selected.
      *
      * The RFS path (per-queue flow table plus the global sock flow table) is
      * tried first; *rflowp is written only when that path picks the CPU.
      * Otherwise the RPS map of the receiving queue is used.
      *
      * rcu_read_lock must be held on entry.
3596  */
3597 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3598                        struct rps_dev_flow **rflowp)
3599 {
3600         const struct rps_sock_flow_table *sock_flow_table;
3601         struct netdev_rx_queue *rxqueue = dev->_rx;
3602         struct rps_dev_flow_table *flow_table;
3603         struct rps_map *map;
3604         int cpu = -1;
3605         u32 tcpu;
3606         u32 hash;
3607
             /* Use the recorded rx queue if there is one, else queue 0. */
3608         if (skb_rx_queue_recorded(skb)) {
3609                 u16 index = skb_get_rx_queue(skb);
3610
3611                 if (unlikely(index >= dev->real_num_rx_queues)) {
3612                         WARN_ONCE(dev->real_num_rx_queues > 1,
3613                                   "%s received packet on queue %u, but number "
3614                                   "of RX queues is %u\n",
3615                                   dev->name, index, dev->real_num_rx_queues);
3616                         goto done;
3617                 }
3618                 rxqueue += index;
3619         }
3620
3621         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3622
3623         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3624         map = rcu_dereference(rxqueue->rps_map);
3625         if (!flow_table && !map)
3626                 goto done;
3627
3628         skb_reset_network_header(skb);
3629         hash = skb_get_hash(skb);
             /* A zero hash is treated as "no hash": no CPU is selected. */
3630         if (!hash)
3631                 goto done;
3632
3633         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3634         if (flow_table && sock_flow_table) {
3635                 struct rps_dev_flow *rflow;
3636                 u32 next_cpu;
3637                 u32 ident;
3638
3639                 /* First check into global flow table if there is a match */
3640                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
                     /* The entry matches only if all bits outside
                      * rps_cpu_mask are equal to those of the hash.
                      */
3641                 if ((ident ^ hash) & ~rps_cpu_mask)
3642                         goto try_rps;
3643
3644                 next_cpu = ident & rps_cpu_mask;
3645
3646                 /* OK, now we know there is a match,
3647                  * we can look at the local (per receive queue) flow table
3648                  */
3649                 rflow = &flow_table->flows[hash & flow_table->mask];
3650                 tcpu = rflow->cpu;
3651
3652                 /*
3653                  * If the desired CPU (where last recvmsg was done) is
3654                  * different from current CPU (one in the rx-queue flow
3655                  * table entry), switch if one of the following holds:
3656                  *   - Current CPU is unset (>= nr_cpu_ids).
3657                  *   - Current CPU is offline.
3658                  *   - The current CPU's queue head (input_queue_head) has
3659                  *     advanced to or beyond the last_qtail recorded for this
3660                  *     table entry.  This guarantees that all previous packets
3661                  *     for the flow have been dequeued, thus preserving in
                      *     order delivery.  The difference is evaluated as a
                      *     signed int so that counter wrap-around is tolerated.
3662                  */
3663                 if (unlikely(tcpu != next_cpu) &&
3664                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3665                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3666                       rflow->last_qtail)) >= 0)) {
3667                         tcpu = next_cpu;
3668                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3669                 }
3670
3671                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3672                         *rflowp = rflow;
3673                         cpu = tcpu;
3674                         goto done;
3675                 }
3676         }
3677
3678 try_rps:
3679
             /* Fall back to the per-queue RPS map, indexed by the hash scaled
              * to the map length; an offline CPU leaves the result at -1.
              */
3680         if (map) {
3681                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3682                 if (cpu_online(tcpu)) {
3683                         cpu = tcpu;
3684                         goto done;
3685                 }
3686         }
3687
3688 done:
3689         return cpu;
3690 }
3691
3692 #ifdef CONFIG_RFS_ACCEL
3693
3694 /**
3695  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3696  * @dev: Device on which the filter was set
3697  * @rxq_index: RX queue index
3698  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3699  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3700  *
3701  * Drivers that implement ndo_rx_flow_steer() should periodically call
3702  * this function for each installed filter and remove the filters for
3703  * which it returns %true.
      *
      * %false is returned only when the queue has a flow table, @flow_id lies
      * within its mask, the entry still carries @filter_id, the entry's CPU is
      * below nr_cpu_ids, and that CPU's input_queue_head is fewer than
      * 10 * mask packets past the entry's last_qtail.
3704  */
3705 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3706                          u32 flow_id, u16 filter_id)
3707 {
3708         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3709         struct rps_dev_flow_table *flow_table;
3710         struct rps_dev_flow *rflow;
3711         bool expire = true;
3712         unsigned int cpu;
3713
3714         rcu_read_lock();
3715         flow_table = rcu_dereference(rxqueue->rps_flow_table);
             if (!flow_table || flow_id > flow_table->mask)
                     goto unlock;

3717         rflow = &flow_table->flows[flow_id];
3718         cpu = ACCESS_ONCE(rflow->cpu);
             if (rflow->filter != filter_id || cpu >= nr_cpu_ids)
                     goto unlock;

             /* Signed comparison of the queue progress since last_qtail. */
             if ((int)(per_cpu(softnet_data, cpu).input_queue_head -
                       rflow->last_qtail) <
                 (int)(10 * flow_table->mask))
3723                 expire = false;

     unlock:
3725         rcu_read_unlock();
3726         return expire;
3727 }
3728 EXPORT_SYMBOL(rps_may_expire_flow);
3729
3730 #endif /* CONFIG_RFS_ACCEL */
3731
3732 /* IPI callback, called from hardirq context.  @data is the softnet_data
      * whose backlog NAPI gets scheduled; its received_rps counter is bumped
      * afterwards.
      */
3733 static void rps_trigger_softirq(void *data)
3734 {
3735         struct softnet_data *target = data;
             struct napi_struct *backlog = &target->backlog;
3736
3737         ____napi_schedule(target, backlog);
3738         target->received_rps += 1;
3739 }
3740
3741 #endif /* CONFIG_RPS */
3742
3743 /*
3744  * Check if this softnet_data structure is another cpu one
3745  * If yes, queue it to our IPI list and return 1
3746  * If no, return 0
3747  */
3748 static int rps_ipi_queued(struct softnet_data *sd)
3749 {
3750 #ifdef CONFIG_RPS
3751         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3752
3753         if (sd != mysd) {
3754                 sd->rps_ipi_next = mysd->rps_ipi_list;
3755                 mysd->rps_ipi_list = sd;
3756
3757                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3758                 return 1;
3759         }
3760 #endif /* CONFIG_RPS */
3761         return 0;
3762 }
3763
3764 #ifdef CONFIG_NET_FLOW_LIMIT
     /* Defaults to 4096 (1 << 12).
      * NOTE(review): presumably the bucket count used when a per-CPU flow
      * limit table is allocated; the consumer is outside this part of the
      * file - confirm.
      */
3765 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3766 #endif
3767
     /*
      * skb_flow_limit - decide whether @skb's flow is over its share of the
      * backlog
      * @skb: packet about to be queued
      * @qlen: current length of the input queue
      *
      * Always false without CONFIG_NET_FLOW_LIMIT, when @qlen is below half of
      * netdev_max_backlog, or when this CPU has no flow_limit table.
      * Otherwise the flow's bucket (hash masked by num_buckets - 1) replaces
      * the oldest entry of the history ring, the bucket counters are adjusted
      * accordingly, and true is returned (after counting the event in
      * fl->count) if the new bucket now exceeds half of FLOW_LIMIT_HISTORY.
      */
3768 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3769 {
3770 #ifdef CONFIG_NET_FLOW_LIMIT
3771         struct sd_flow_limit *fl;
3772         struct softnet_data *sd;
3773         unsigned int evicted, incoming;
             bool over_limit = false;
3774
3775         if (qlen < (netdev_max_backlog >> 1))
3776                 return false;
3777
3778         sd = this_cpu_ptr(&softnet_data);
3779
3780         rcu_read_lock();
3781         fl = rcu_dereference(sd->flow_limit);
3782         if (fl) {
3783                 incoming = skb_get_hash(skb) & (fl->num_buckets - 1);
                     /* Overwrite the oldest history slot with the new flow. */
3784                 evicted = fl->history[fl->history_head];
3785                 fl->history[fl->history_head] = incoming;
3786
3787                 fl->history_head++;
3788                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3789
3790                 if (likely(fl->buckets[evicted]))
3791                         fl->buckets[evicted]--;
3792
3793                 if (++fl->buckets[incoming] > (FLOW_LIMIT_HISTORY >> 1)) {
3794                         fl->count++;
                             over_limit = true;
3797                 }
3798         }
3799         rcu_read_unlock();

             if (over_limit)
                     return true;
3800 #endif
3801         return false;
3802 }
3803
3804 /*
3805  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3806  * queue (may be a remote CPU queue).
      *
      * @skb: packet to queue; it is freed here if it cannot be queued
      * @cpu: CPU whose softnet_data backlog receives the packet
      * @qtail: passed to input_queue_tail_incr_save() when the packet is queued
      *
      * Returns NET_RX_SUCCESS when the skb was queued.  Returns NET_RX_DROP
      * when the device is not running, the queue already holds more than
      * netdev_max_backlog packets, or skb_flow_limit() rejects the flow; the
      * drop is counted in sd->dropped and in the device's rx_dropped.
3807  */
3808 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3809                               unsigned int *qtail)
3810 {
3811         struct softnet_data *sd;
3812         unsigned long flags;
3813         unsigned int qlen;
3814
3815         sd = &per_cpu(softnet_data, cpu);
3816
3817         local_irq_save(flags);
3818
3819         rps_lock(sd);
3820         if (!netif_running(skb->dev))
3821                 goto drop;
3822         qlen = skb_queue_len(&sd->input_pkt_queue);
3823         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
                     /* Non-empty queue: just append.  The enqueue label is
                      * also the target of the goto below, used after the
                      * backlog NAPI has been scheduled for an empty queue.
                      */
3824                 if (qlen) {
3825 enqueue:
3826                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3827                         input_queue_tail_incr_save(sd, qtail);
3828                         rps_unlock(sd);
3829                         local_irq_restore(flags);
3830                         return NET_RX_SUCCESS;
3831                 }
3832
3833                 /* Schedule NAPI for backlog device
3834                  * We can use non atomic operation since we own the queue lock
3835                  */
3836                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                             /* rps_ipi_queued() returns 1 when sd belongs to
                              * another CPU and has been put on our IPI list;
                              * only a local sd is scheduled directly.
                              */
3837                         if (!rps_ipi_queued(sd))
3838                                 ____napi_schedule(sd, &sd->backlog);
3839                 }
3840                 goto enqueue;
3841         }
3842
3843 drop:
3844         sd->dropped++;
3845         rps_unlock(sd);
3846
3847         local_irq_restore(flags);
3848
3849         atomic_long_inc(&skb->dev->rx_dropped);
3850         kfree_skb(skb);
3851         return NET_RX_DROP;
3852 }
3853
     /*
      * netif_rx_internal - pick a CPU and queue @skb on its backlog
      *
      * With CONFIG_RPS and the rps_needed static key enabled, get_rps_cpu()
      * chooses the target CPU (the current CPU is used when it returns a
      * negative value).  Otherwise the skb goes to the current CPU's backlog.
      * Returns the result of enqueue_to_backlog().
      */
3854 static int netif_rx_internal(struct sk_buff *skb)
3855 {
             unsigned int qtail;
3856         int ret;
3857
3858         net_timestamp_check(netdev_tstamp_prequeue, skb);
3859
3860         trace_netif_rx(skb);
3861 #ifdef CONFIG_RPS
3862         if (static_key_false(&rps_needed)) {
                     /* scratch_flow only provides a valid last_qtail target
                      * in case get_rps_cpu() does not replace rflow.
                      */
                     struct rps_dev_flow scratch_flow;
                     struct rps_dev_flow *rflow = &scratch_flow;
                     int target_cpu;
3865
3866                 preempt_disable();
3867                 rcu_read_lock();
3868
3869                 target_cpu = get_rps_cpu(skb->dev, skb, &rflow);
3870                 if (target_cpu < 0)
3871                         target_cpu = smp_processor_id();
3872
3873                 ret = enqueue_to_backlog(skb, target_cpu, &rflow->last_qtail);
3874
3875                 rcu_read_unlock();
3876                 preempt_enable();
                     return ret;
             }
3878 #endif
3881         ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3882         put_cpu();
3884         return ret;
3885 }
3886
3887 /**
3888  *      netif_rx        -       post buffer to the network code
3889  *      @skb: buffer to post
3890  *
3891  *      This function receives a packet from a device driver and queues it for
3892  *      the upper (protocol) levels to process.  The buffer is always taken
3893  *      over by the network code: it is either queued or dropped and freed,
3894  *      e.g. for congestion control, and may still be dropped later by the
      *      protocol layers.
3895  *
3896  *      return values:
3897  *      NET_RX_SUCCESS  (no congestion)
3898  *      NET_RX_DROP     (packet was dropped)
3899  *
3900  */
3901
3902 int netif_rx(struct sk_buff *skb)
3903 {
             int verdict;

3904         trace_netif_rx_entry(skb);
3905
3906         verdict = netif_rx_internal(skb);
             return verdict;
3907 }
3908 EXPORT_SYMBOL(netif_rx);
3909
     /**
      *      netif_rx_ni - post buffer to the network code, then run softirqs
      *      @skb: buffer to post
      *
      *      Queues @skb through netif_rx_internal() with preemption disabled
      *      and, if a softirq is pending afterwards, runs do_softirq() before
      *      preemption is re-enabled.  Returns what netif_rx_internal()
      *      returned.
      */
3910 int netif_rx_ni(struct sk_buff *skb)
3911 {
3912         int verdict;
3913
3914         trace_netif_rx_ni_entry(skb);
3915
3916         preempt_disable();
3917         verdict = netif_rx_internal(skb);
3918         if (local_softirq_pending()) {
3919                 do_softirq();
             }
3920         preempt_enable();
3921
3922         return verdict;
3923 }
3924 EXPORT_SYMBOL(netif_rx_ni);
3925
     /*
      * net_tx_action - drain this CPU's completion queue and output queue
      *
      * Has the softirq handler signature; @h is not used.
      * NOTE(review): presumably registered for NET_TX_SOFTIRQ elsewhere in the
      * file - confirm.
      *
      * Part one frees every skb parked on sd->completion_queue.  Part two
      * runs every qdisc chained on sd->output_queue under its root lock.
      * Both lists are detached with local irqs disabled and then walked with
      * irqs enabled again.
      */
3926 static __latent_entropy void net_tx_action(struct softirq_action *h)
3927 {
3928         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3929
3930         if (sd->completion_queue) {
3931                 struct sk_buff *clist;
3932
                     /* Take the whole list in one go. */
3933                 local_irq_disable();
3934                 clist = sd->completion_queue;
3935                 sd->completion_queue = NULL;
3936                 local_irq_enable();
3937
3938                 while (clist) {
3939                         struct sk_buff *skb = clist;
3940                         clist = clist->next;
3941
                             /* Warn if an skb on this list still has users. */
3942                         WARN_ON(atomic_read(&skb->users));
                             /* The recorded reason only selects which
                              * tracepoint fires.
                              */
3943                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3944                                 trace_consume_skb(skb);
3945                         else
3946                                 trace_kfree_skb(skb, net_tx_action);
3947
                             /* Only skbs marked SKB_FCLONE_UNAVAILABLE take
                              * the deferred path flushed after the loop.
                              */
3948                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3949                                 __kfree_skb(skb);
3950                         else
3951                                 __kfree_skb_defer(skb);
3952                 }
3953
3954                 __kfree_skb_flush();
3955         }
3956
3957         if (sd->output_queue) {
3958                 struct Qdisc *head;
3959
                     /* Detach the list and reset the tail pointer so that it
                      * points at the now-empty head again.
                      */
3960                 local_irq_disable();
3961                 head = sd->output_queue;
3962                 sd->output_queue = NULL;
3963                 sd->output_queue_tailp = &sd->output_queue;
3964                 local_irq_enable();
3965
3966                 while (head) {
3967                         struct Qdisc *q = head;
3968                         spinlock_t *root_lock;
3969
3970                         head = head->next_sched;
3971
3972                         root_lock = qdisc_lock(q);
3973                         spin_lock(root_lock);
3974                         /* We need to make sure head->next_sched is read
3975                          * before clearing __QDISC_STATE_SCHED
3976                          */
3977                         smp_mb__before_atomic();
3978                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3979                         qdisc_run(q);
3980                         spin_unlock(root_lock);
3981                 }
3982         }
3983 }
3984
3985 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3986 /* This hook is defined here for ATM LANE.
      * It is a function pointer taking a device and an address and returning
      * int.  Nothing in this part of the file assigns it, so it starts out
      * NULL.
      * NOTE(review): presumably installed by the bridge code and called by
      * LANE - confirm against both users.
      */
3987 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3988                              unsigned char *addr) __read_mostly;
3989 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3990 #endif
3991
     /*
      * sch_handle_ingress - run the ingress classifier chain on @skb
      * @skb: received packet
      * @pt_prev: pending packet handler; delivered to and cleared before
      *           classification when set
      * @ret: receives the result of that pending delivery
      * @orig_dev: device the packet originally arrived on
      *
      * Returns @skb when processing should continue (always the case without
      * CONFIG_NET_CLS_ACT or when the device has no ingress classifier list).
      * Returns NULL when the classifier verdict took the packet away: it was
      * dropped, consumed or redirected.
      */
3992 static inline struct sk_buff *
3993 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3994                    struct net_device *orig_dev)
3995 {
3996 #ifdef CONFIG_NET_CLS_ACT
3997         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3998         struct tcf_result cl_res;
3999
4000         /* If there's at least one ingress present somewhere (so
4001          * we get here via enabled static key), remaining devices
4002          * that are not configured with an ingress qdisc will bail
4003          * out here.
4004          */
4005         if (!cl)
4006                 return skb;
             /* Flush the handler that is still pending before the classifier
              * gets to see the packet.
              */
4007         if (*pt_prev) {
4008                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4009                 *pt_prev = NULL;
4010         }
4011
4012         qdisc_skb_cb(skb)->pkt_len = skb->len;
4013         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
4014         qdisc_bstats_cpu_update(cl->q, skb);
4015
4016         switch (tc_classify(skb, cl, &cl_res, false)) {
4017         case TC_ACT_OK:
4018         case TC_ACT_RECLASSIFY:
                     /* Keep the packet and remember the minor class id. */
4019                 skb->tc_index = TC_H_MIN(cl_res.classid);
4020                 break;
4021         case TC_ACT_SHOT:
                     /* Dropped by policy: counted as a qdisc drop. */
4022                 qdisc_qstats_cpu_drop(cl->q);
4023                 kfree_skb(skb);
4024                 return NULL;
4025         case TC_ACT_STOLEN:
4026         case TC_ACT_QUEUED:
                     /* Released with consume_skb() rather than kfree_skb(). */
4027                 consume_skb(skb);
4028                 return NULL;
4029         case TC_ACT_REDIRECT:
4030                 /* skb_mac_header check was done by cls/act_bpf, so
4031                  * we can safely push the L2 header back before
4032                  * redirecting to another netdev
4033                  */
4034                 __skb_push(skb, skb->mac_len);
4035                 skb_do_redirect(skb);
4036                 return NULL;
4037         default:
4038                 break;
4039         }
4040 #endif /* CONFIG_NET_CLS_ACT */
4041         return skb;
4042 }
4043
4044 /**
4045  *      netdev_is_rx_handler_busy - check if receive handler is registered
4046  *      @dev: device to check
4047  *
4048  *      Check if a receive handler is already registered for a given device.
4049  *      Return true if there is one; a NULL @dev yields false.
4050  *
4051  *      The caller must hold the rtnl_mutex.
4052  */
4053 bool netdev_is_rx_handler_busy(struct net_device *dev)
4054 {
4055         ASSERT_RTNL();

             if (!dev)
                     return false;

4056         return rtnl_dereference(dev->rx_handler) != NULL;
4057 }
4058 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4059
4060 /**
4061  *      netdev_rx_handler_register - register receive handler
4062  *      @dev: device to register a handler for
4063  *      @rx_handler: receive handler to register
4064  *      @rx_handler_data: data pointer that is used by rx handler
4065  *
4066  *      Register a receive handler for a device. This handler will then be
4067  *      called from __netif_receive_skb. A negative errno code is returned
4068  *      on a failure: -EBUSY if the device already has a receive handler.
      *      Returns 0 on success.
4069  *
4070  *      The caller must hold the rtnl_mutex.
4071  *
4072  *      For a general description of rx_handler, see enum rx_handler_result.
4073  */
4074 int netdev_rx_handler_register(struct net_device *dev,
4075                                rx_handler_func_t *rx_handler,
4076                                void *rx_handler_data)
4077 {
4078         ASSERT_RTNL();
4079
             /* rx_handler is an RCU-managed pointer (published below with
              * rcu_assign_pointer() and read in the receive path with
              * rcu_dereference()), so read it through the RTNL accessor, as
              * netdev_is_rx_handler_busy() does, rather than with a plain
              * load.
              */
4080         if (rtnl_dereference(dev->rx_handler))
4081                 return -EBUSY;
4082
4083         /* Note: rx_handler_data must be set before rx_handler */
4084         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4085         rcu_assign_pointer(dev->rx_handler, rx_handler);
4086
4087         return 0;
4088 }
4089 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4090
4091 /**
4092  *      netdev_rx_handler_unregister - unregister receive handler
4093  *      @dev: device to unregister a handler from
4094  *
4095  *      Unregister a receive handler from a device.  The handler pointer is
4096  *      cleared first; the handler data pointer is cleared only after
      *      synchronize_net() has returned.
      *
4097  *      The caller must hold the rtnl_mutex.
4098  */
4099 void netdev_rx_handler_unregister(struct net_device *dev)
4100 {
4101
4102         ASSERT_RTNL();
4103         RCU_INIT_POINTER(dev->rx_handler, NULL);
4104         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4105          * section has a guarantee to see a non NULL rx_handler_data
4106          * as well.
4107          */
4108         synchronize_net();
4109         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4110 }
4111 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4112
4113 /*
4114  * PFMEMALLOC reserves may only be spent on protocols that implement the
4115  * special handling of PFMEMALLOC skbs.  Returns true when skb->protocol is
      * ARP, IPv4, IPv6, 802.1Q or 802.1AD and false for everything else.
4116  */
4117 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4118 {
             const __be16 proto = skb->protocol;

             return proto == htons(ETH_P_ARP) ||
                    proto == htons(ETH_P_IP) ||
                    proto == htons(ETH_P_IPV6) ||
                    proto == htons(ETH_P_8021Q) ||
                    proto == htons(ETH_P_8021AD);
4129 }
4130
4131 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4132                              int *ret, struct net_device *orig_dev)
4133 {
4134 #ifdef CONFIG_NETFILTER_INGRESS
4135         if (nf_hook_ingress_active(skb)) {
4136                 int ingress_retval;
4137
4138                 if (*pt_prev) {
4139                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
4140                         *pt_prev = NULL;
4141                 }
4142
4143                 rcu_read_lock();
4144                 ingress_retval = nf_hook_ingress(skb);
4145                 rcu_read_unlock();
4146                 return ingress_retval;
4147         }
4148 #endif /* CONFIG_NETFILTER_INGRESS */
4149         return 0;
4150 }
4151
     /*
      * __netif_receive_skb_core - push one received skb through the stack
      * @skb: received packet
      * @pfmemalloc: when true, the taps are skipped and packets whose protocol
      *              is not accepted by skb_pfmemalloc_protocol() are dropped
      *
      * Order of processing: taps (ptype_all lists), ingress classifier and
      * netfilter ingress hook, VLAN handling, the device's rx_handler, and
      * finally the protocol handlers.  Processing restarts at another_round
      * when vlan_do_receive() returns non-zero or the rx_handler returns
      * RX_HANDLER_ANOTHER.
      *
      * pt_prev always lags one handler behind: every handler except the last
      * is invoked through deliver_skb(), the last one is called directly via
      * pt_prev->func().  If no handler is left at the end, the skb is freed
      * and counted in rx_dropped (or rx_nohandler when the rx_handler asked
      * for exact delivery).
      *
      * Returns NET_RX_DROP unless a handler's result or NET_RX_SUCCESS (skb
      * consumed by the rx_handler) replaced it.
      */
4152 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4153 {
4154         struct packet_type *ptype, *pt_prev;
4155         rx_handler_func_t *rx_handler;
4156         struct net_device *orig_dev;
4157         bool deliver_exact = false;
4158         int ret = NET_RX_DROP;
4159         __be16 type;
4160
4161         net_timestamp_check(!netdev_tstamp_prequeue, skb);
4162
4163         trace_netif_receive_skb(skb);
4164
4165         orig_dev = skb->dev;
4166
4167         skb_reset_network_header(skb);
4168         if (!skb_transport_header_was_set(skb))
4169                 skb_reset_transport_header(skb);
4170         skb_reset_mac_len(skb);
4171
4172         pt_prev = NULL;
4173
4174 another_round:
4175         skb->skb_iif = skb->dev->ifindex;
4176
4177         __this_cpu_inc(softnet_data.processed);
4178
             /* A VLAN ethertype still in skb->protocol: untag first.  A NULL
              * return leaves no skb to process.
              */
4179         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4180             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4181                 skb = skb_vlan_untag(skb);
4182                 if (unlikely(!skb))
4183                         goto out;
4184         }
4185
4186 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip taps and ingress processing. */
4187         if (skb->tc_verd & TC_NCLS) {
4188                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4189                 goto ncls;
4190         }
4191 #endif
4192
4193         if (pfmemalloc)
4194                 goto skip_taps;
4195
             /* Global taps, then the taps bound to the current device. */
4196         list_for_each_entry_rcu(ptype, &ptype_all, list) {
4197                 if (pt_prev)
4198                         ret = deliver_skb(skb, pt_prev, orig_dev);
4199                 pt_prev = ptype;
4200         }
4201
4202         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4203                 if (pt_prev)
4204                         ret = deliver_skb(skb, pt_prev, orig_dev);
4205                 pt_prev = ptype;
4206         }
4207
4208 skip_taps:
4209 #ifdef CONFIG_NET_INGRESS
4210         if (static_key_false(&ingress_needed)) {
                     /* NULL means the classifier took the packet. */
4211                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4212                 if (!skb)
4213                         goto out;
4214
                     /* A negative netfilter result also ends processing. */
4215                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4216                         goto out;
4217         }
4218 #endif
4219 #ifdef CONFIG_NET_CLS_ACT
4220         skb->tc_verd = 0;
4221 ncls:
4222 #endif
4223         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4224                 goto drop;
4225
4226         if (skb_vlan_tag_present(skb)) {
4227                 if (pt_prev) {
4228                         ret = deliver_skb(skb, pt_prev, orig_dev);
4229                         pt_prev = NULL;
4230                 }
                     /* vlan_do_receive() is given &skb and may replace or
                      * clear it; non-zero restarts processing.
                      */
4231                 if (vlan_do_receive(&skb))
4232                         goto another_round;
4233                 else if (unlikely(!skb))
4234                         goto out;
4235         }
4236
4237         rx_handler = rcu_dereference(skb->dev->rx_handler);
4238         if (rx_handler) {
4239                 if (pt_prev) {
4240                         ret = deliver_skb(skb, pt_prev, orig_dev);
4241                         pt_prev = NULL;
4242                 }
4243                 switch (rx_handler(&skb)) {
4244                 case RX_HANDLER_CONSUMED:
4245                         ret = NET_RX_SUCCESS;
4246                         goto out;
4247                 case RX_HANDLER_ANOTHER:
4248                         goto another_round;
4249                 case RX_HANDLER_EXACT:
4250                         deliver_exact = true;
                             /* fall through */
4251                 case RX_HANDLER_PASS:
4252                         break;
4253                 default:
4254                         BUG();
4255                 }
4256         }
4257
             /* A VLAN tag that is still present here was not handled above:
              * a non-zero VLAN id marks the packet as PACKET_OTHERHOST.
              */
4258         if (unlikely(skb_vlan_tag_present(skb))) {
4259                 if (skb_vlan_tag_get_id(skb))
4260                         skb->pkt_type = PACKET_OTHERHOST;
4261                 /* Note: we might in the future use prio bits
4262                  * and set skb->priority like in vlan_do_receive()
4263                  * For the time being, just ignore Priority Code Point
4264                  */
4265                 skb->vlan_tci = 0;
4266         }
4267
4268         type = skb->protocol;
4269
4270         /* deliver only exact match when indicated */
4271         if (likely(!deliver_exact)) {
4272                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4273                                        &ptype_base[ntohs(type) &
4274                                                    PTYPE_HASH_MASK]);
4275         }
4276
             /* Handlers bound to the original device, then those bound to the
              * current device if the two differ.
              */
4277         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4278                                &orig_dev->ptype_specific);
4279
4280         if (unlikely(skb->dev != orig_dev)) {
4281                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4282                                        &skb->dev->ptype_specific);
4283         }
4284
4285         if (pt_prev) {
                     /* The last handler is called directly; if
                      * skb_orphan_frags() fails the packet is dropped instead.
                      */
4286                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4287                         goto drop;
4288                 else
4289                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4290         } else {
4291 drop:
4292                 if (!deliver_exact)
4293                         atomic_long_inc(&skb->dev->rx_dropped);
4294                 else
4295                         atomic_long_inc(&skb->dev->rx_nohandler);
4296                 kfree_skb(skb);
4297                 /* Jamal, now you will not be able to escape explaining
4298                  * to me how you were going to use this. :-)
4299                  */
4300                 ret = NET_RX_DROP;
4301         }
4302
4303 out:
4304         return ret;
4305 }
4306
4307 static int __netif_receive_skb(struct sk_buff *skb)
4308 {
4309         int ret;
4310
4311         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4312                 unsigned long pflags = current->flags;
4313
4314                 /*
4315                  * PFMEMALLOC skbs are special, they should
4316                  * - be delivered to SOCK_MEMALLOC sockets only
4317                  * - stay away from userspace
4318                  * - have bounded memory usage
4319                  *
4320                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4321                  * context down to all allocation sites.
4322                  */
4323                 current->flags |= PF_MEMALLOC;
4324                 ret = __netif_receive_skb_core(skb, true);
4325                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4326         } else
4327                 ret = __netif_receive_skb_core(skb, false);
4328
4329         return ret;
4330 }
4331
4332 static int netif_receive_skb_internal(struct sk_buff *skb)
4333 {
4334         int ret;
4335
4336         net_timestamp_check(netdev_tstamp_prequeue, skb);
4337
4338         if (skb_defer_rx_timestamp(skb))
4339                 return NET_RX_SUCCESS;
4340
4341         rcu_read_lock();
4342
4343 #ifdef CONFIG_RPS
4344         if (static_key_false(&rps_needed)) {
4345                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4346                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4347
4348                 if (cpu >= 0) {
4349                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4350                         rcu_read_unlock();
4351                         return ret;
4352                 }
4353         }
4354 #endif
4355         ret = __netif_receive_skb(skb);
4356         rcu_read_unlock();
4357         return ret;
4358 }
4359
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for received data. The call itself always
 *	succeeds; the buffer may still be dropped while it is processed,
 *	either for congestion control or by the protocol layers.
 *
 *	Must only be called from softirq context, with interrupts enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4382
/* One work item per CPU; flush_all_backlogs() queues each of them on its own
 * CPU and then waits for completion. NOTE(review): presumably initialised
 * elsewhere with flush_backlog() as the handler - confirm.
 */
4383 DEFINE_PER_CPU(struct work_struct, flush_works);
4384
4385 /* Network device is going away, flush any packets still pending */
     /* Runs on one CPU and works on that CPU's softnet_data only: every skb
      * in input_pkt_queue and process_queue whose device is in the
      * NETREG_UNREGISTERING state is unlinked and freed, and the input queue
      * head counter is advanced once per freed skb.
      * @work is not used by the body.
      */
4386 static void flush_backlog(struct work_struct *work)
4387 {
4388         struct sk_buff *skb, *tmp;
4389         struct softnet_data *sd;
4390
             /* Bottom halves stay disabled for the whole function */
4391         local_bh_disable();
4392         sd = this_cpu_ptr(&softnet_data);
4393
             /* input_pkt_queue is walked with local irqs off and rps_lock() held */
4394         local_irq_disable();
4395         rps_lock(sd);
4396         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4397                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4398                         __skb_unlink(skb, &sd->input_pkt_queue);
4399                         kfree_skb(skb);
4400                         input_queue_head_incr(sd);
4401                 }
4402         }
4403         rps_unlock(sd);
4404         local_irq_enable();
4405
             /* process_queue is walked with only bottom halves disabled */
4406         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4407                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4408                         __skb_unlink(skb, &sd->process_queue);
4409                         kfree_skb(skb);
4410                         input_queue_head_incr(sd);
4411                 }
4412         }
4413         local_bh_enable();
4414 }
4415
4416 static void flush_all_backlogs(void)
4417 {
4418         unsigned int cpu;
4419
4420         get_online_cpus();
4421
4422         for_each_online_cpu(cpu)
4423                 queue_work_on(cpu, system_highpri_wq,
4424                               per_cpu_ptr(&flush_works, cpu));
4425
4426         for_each_online_cpu(cpu)
4427                 flush_work(per_cpu_ptr(&flush_works, cpu));
4428
4429         put_online_cpus();
4430 }
4431
4432 static int napi_gro_complete(struct sk_buff *skb)
4433 {
4434         struct packet_offload *ptype;
4435         __be16 type = skb->protocol;
4436         struct list_head *head = &offload_base;
4437         int err = -ENOENT;
4438
4439         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4440
4441         if (NAPI_GRO_CB(skb)->count == 1) {
4442                 skb_shinfo(skb)->gso_size = 0;
4443                 goto out;
4444         }
4445
4446         rcu_read_lock();
4447         list_for_each_entry_rcu(ptype, head, list) {
4448                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4449                         continue;
4450
4451                 err = ptype->callbacks.gro_complete(skb, 0);
4452                 break;
4453         }
4454         rcu_read_unlock();
4455
4456         if (err) {
4457                 WARN_ON(&ptype->list == head);
4458                 kfree_skb(skb);
4459                 return NET_RX_SUCCESS;
4460         }
4461
4462 out:
4463         return netif_receive_skb_internal(skb);
4464 }
4465
4466 /* napi->gro_list contains packets ordered by age.
4467  * youngest packets at the head of it.
4468  * Complete skbs in reverse order to reduce latencies.
4469  */
4470 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4471 {
4472         struct sk_buff *skb, *prev = NULL;
4473
4474         /* scan list and build reverse chain */
4475         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4476                 skb->prev = prev;
4477                 prev = skb;
4478         }
4479
4480         for (skb = prev; skb; skb = prev) {
4481                 skb->next = NULL;
4482
4483                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4484                         return;
4485
4486                 prev = skb->prev;
4487                 napi_gro_complete(skb);
4488                 napi->gro_count--;
4489         }
4490
4491         napi->gro_list = NULL;
4492 }
4493 EXPORT_SYMBOL(napi_gro_flush);
4494
4495 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4496 {
4497         struct sk_buff *p;
4498         unsigned int maclen = skb->dev->hard_header_len;
4499         u32 hash = skb_get_hash_raw(skb);
4500
4501         for (p = napi->gro_list; p; p = p->next) {
4502                 unsigned long diffs;
4503
4504                 NAPI_GRO_CB(p)->flush = 0;
4505
4506                 if (hash != skb_get_hash_raw(p)) {
4507                         NAPI_GRO_CB(p)->same_flow = 0;
4508                         continue;
4509                 }
4510
4511                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4512                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4513                 diffs |= skb_metadata_dst_cmp(p, skb);
4514                 if (maclen == ETH_HLEN)
4515                         diffs |= compare_ether_header(skb_mac_header(p),
4516                                                       skb_mac_header(skb));
4517                 else if (!diffs)
4518                         diffs = memcmp(skb_mac_header(p),
4519                                        skb_mac_header(skb),
4520                                        maclen);
4521                 NAPI_GRO_CB(p)->same_flow = !diffs;
4522         }
4523 }
4524
4525 static void skb_gro_reset_offset(struct sk_buff *skb)
4526 {
4527         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4528         const skb_frag_t *frag0 = &pinfo->frags[0];
4529
4530         NAPI_GRO_CB(skb)->data_offset = 0;
4531         NAPI_GRO_CB(skb)->frag0 = NULL;
4532         NAPI_GRO_CB(skb)->frag0_len = 0;
4533
4534         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4535             pinfo->nr_frags &&
4536             !PageHighMem(skb_frag_page(frag0))) {
4537                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4538                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4539                                                     skb_frag_size(frag0),
4540                                                     skb->end - skb->tail);
4541         }
4542 }
4543
4544 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4545 {
4546         struct skb_shared_info *pinfo = skb_shinfo(skb);
4547
4548         BUG_ON(skb->end - skb->tail < grow);
4549
4550         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4551
4552         skb->data_len -= grow;
4553         skb->tail += grow;
4554
4555         pinfo->frags[0].page_offset += grow;
4556         skb_frag_size_sub(&pinfo->frags[0], grow);
4557
4558         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4559                 skb_frag_unref(skb, 0);
4560                 memmove(pinfo->frags, pinfo->frags + 1,
4561                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4562         }
4563 }
4564
/* Run @skb through GRO for @napi.
 *
 * The gro_receive callback of the first packet_offload whose type equals
 * skb->protocol is invoked against napi->gro_list. Result:
 *   GRO_NORMAL      - GRO not attempted (feature off, GSO skb, frag list,
 *                     csum_bad, no matching offload) or the callback set
 *                     the flush flag without marking same_flow
 *   GRO_MERGED      - callback marked same_flow, "free" flag clear
 *   GRO_MERGED_FREE - callback marked same_flow, "free" flag set
 *   GRO_HELD        - skb was put at the head of napi->gro_list
 * On the GRO_NORMAL and GRO_HELD paths the linear area is grown from frag0
 * if the GRO offset lies beyond it; the same_flow path skips that step.
 */
4565 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4566 {
4567         struct sk_buff **pp = NULL;
4568         struct packet_offload *ptype;
4569         __be16 type = skb->protocol;
4570         struct list_head *head = &offload_base;
4571         int same_flow;
4572         enum gro_result ret;
4573         int grow;
4574
             /* GRO switched off on the receiving device */
4575         if (!(skb->dev->features & NETIF_F_GRO))
4576                 goto normal;
4577
             /* GSO skbs, skbs with a frag list and csum_bad skbs bypass GRO */
4578         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4579                 goto normal;
4580
             /* Set same_flow / clear flush on every held skb relative to @skb */
4581         gro_list_prepare(napi, skb);
4582
4583         rcu_read_lock();
4584         list_for_each_entry_rcu(ptype, head, list) {
4585                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4586                         continue;
4587
                     /* Fresh per-packet GRO control block state for the callback */
4588                 skb_set_network_header(skb, skb_gro_offset(skb));
4589                 skb_reset_mac_len(skb);
4590                 NAPI_GRO_CB(skb)->same_flow = 0;
4591                 NAPI_GRO_CB(skb)->flush = 0;
4592                 NAPI_GRO_CB(skb)->free = 0;
4593                 NAPI_GRO_CB(skb)->encap_mark = 0;
4594                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4595                 NAPI_GRO_CB(skb)->is_fou = 0;
4596                 NAPI_GRO_CB(skb)->is_atomic = 1;
4597                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4598
4599                 /* Setup for GRO checksum validation */
4600                 switch (skb->ip_summed) {
4601                 case CHECKSUM_COMPLETE:
4602                         NAPI_GRO_CB(skb)->csum = skb->csum;
4603                         NAPI_GRO_CB(skb)->csum_valid = 1;
4604                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4605                         break;
4606                 case CHECKSUM_UNNECESSARY:
4607                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4608                         NAPI_GRO_CB(skb)->csum_valid = 0;
4609                         break;
4610                 default:
4611                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4612                         NAPI_GRO_CB(skb)->csum_valid = 0;
4613                 }
4614
                     /* Only the first matching offload is tried */
4615                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4616                 break;
4617         }
4618         rcu_read_unlock();
4619
             /* Cursor is back at the list head: no offload matched */
4620         if (&ptype->list == head)
4621                 goto normal;
4622
4623         same_flow = NAPI_GRO_CB(skb)->same_flow;
4624         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4625
             /* The callback handed back a list slot: unlink the skb found
              * there and complete it now
              */
4626         if (pp) {
4627                 struct sk_buff *nskb = *pp;
4628
4629                 *pp = nskb->next;
4630                 nskb->next = NULL;
4631                 napi_gro_complete(nskb);
4632                 napi->gro_count--;
4633         }
4634
4635         if (same_flow)
4636                 goto ok;
4637
4638         if (NAPI_GRO_CB(skb)->flush)
4639                 goto normal;
4640
             /* List is full: complete the last (oldest) entry to make room.
              * gro_count is left alone, one skb leaves and one is added below.
              */
4641         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4642                 struct sk_buff *nskb = napi->gro_list;
4643
4644                 /* locate the end of the list to select the 'oldest' flow */
4645                 while (nskb->next) {
4646                         pp = &nskb->next;
4647                         nskb = *pp;
4648                 }
4649                 *pp = NULL;
4650                 nskb->next = NULL;
4651                 napi_gro_complete(nskb);
4652         } else {
4653                 napi->gro_count++;
4654         }
             /* Start a new held flow with @skb at the head of the list */
4655         NAPI_GRO_CB(skb)->count = 1;
4656         NAPI_GRO_CB(skb)->age = jiffies;
4657         NAPI_GRO_CB(skb)->last = skb;
4658         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4659         skb->next = napi->gro_list;
4660         napi->gro_list = skb;
4661         ret = GRO_HELD;
4662
4663 pull:
             /* Bring bytes up to the GRO offset into the linear area if needed */
4664         grow = skb_gro_offset(skb) - skb_headlen(skb);
4665         if (grow > 0)
4666                 gro_pull_from_frag0(skb, grow);
4667 ok:
4668         return ret;
4669
4670 normal:
4671         ret = GRO_NORMAL;
4672         goto pull;
4673 }
4674
4675 struct packet_offload *gro_find_receive_by_type(__be16 type)
4676 {
4677         struct list_head *offload_head = &offload_base;
4678         struct packet_offload *ptype;
4679
4680         list_for_each_entry_rcu(ptype, offload_head, list) {
4681                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4682                         continue;
4683                 return ptype;
4684         }
4685         return NULL;
4686 }
4687 EXPORT_SYMBOL(gro_find_receive_by_type);
4688
4689 struct packet_offload *gro_find_complete_by_type(__be16 type)
4690 {
4691         struct list_head *offload_head = &offload_base;
4692         struct packet_offload *ptype;
4693
4694         list_for_each_entry_rcu(ptype, offload_head, list) {
4695                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4696                         continue;
4697                 return ptype;
4698         }
4699         return NULL;
4700 }
4701 EXPORT_SYMBOL(gro_find_complete_by_type);
4702
/* Release an skb on the NAPI_GRO_FREE_STOLEN_HEAD path: drop its dst and
 * give the struct sk_buff itself back to skbuff_head_cache. Nothing else
 * belonging to the skb is freed here.
 * NOTE(review): presumably the data was taken over by the skb this one was
 * merged into - confirm against the code that sets NAPI_GRO_FREE_STOLEN_HEAD.
 */
4703 static void napi_skb_free_stolen_head(struct sk_buff *skb)
4704 {
4705         skb_dst_drop(skb);
4706         kmem_cache_free(skbuff_head_cache, skb);
4707 }
4708
4709 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4710 {
4711         switch (ret) {
4712         case GRO_NORMAL:
4713                 if (netif_receive_skb_internal(skb))
4714                         ret = GRO_DROP;
4715                 break;
4716
4717         case GRO_DROP:
4718                 kfree_skb(skb);
4719                 break;
4720
4721         case GRO_MERGED_FREE:
4722                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4723                         napi_skb_free_stolen_head(skb);
4724                 else
4725                         __kfree_skb(skb);
4726                 break;
4727
4728         case GRO_HELD:
4729         case GRO_MERGED:
4730                 break;
4731         }
4732
4733         return ret;
4734 }
4735
4736 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4737 {
4738         skb_mark_napi_id(skb, napi);
4739         trace_napi_gro_receive_entry(skb);
4740
4741         skb_gro_reset_offset(skb);
4742
4743         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4744 }
4745 EXPORT_SYMBOL(napi_gro_receive);
4746
4747 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4748 {
4749         if (unlikely(skb->pfmemalloc)) {
4750                 consume_skb(skb);
4751                 return;
4752         }
4753         __skb_pull(skb, skb_headlen(skb));
4754         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4755         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4756         skb->vlan_tci = 0;
4757         skb->dev = napi->dev;
4758         skb->skb_iif = 0;
4759
4760         /* eth_type_trans() assumes pkt_type is PACKET_HOST */
4761         skb->pkt_type = PACKET_HOST;
4762
4763         skb->encapsulation = 0;
4764         skb_shinfo(skb)->gso_type = 0;
4765         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4766
4767         napi->skb = skb;
4768 }
4769
4770 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4771 {
4772         struct sk_buff *skb = napi->skb;
4773
4774         if (!skb) {
4775                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4776                 if (skb) {
4777                         napi->skb = skb;
4778                         skb_mark_napi_id(skb, napi);
4779                 }
4780         }
4781         return skb;
4782 }
4783 EXPORT_SYMBOL(napi_get_frags);
4784
4785 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4786                                       struct sk_buff *skb,
4787                                       gro_result_t ret)
4788 {
4789         switch (ret) {
4790         case GRO_NORMAL:
4791         case GRO_HELD:
4792                 __skb_push(skb, ETH_HLEN);
4793                 skb->protocol = eth_type_trans(skb, skb->dev);
4794                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4795                         ret = GRO_DROP;
4796                 break;
4797
4798         case GRO_DROP:
4799                 napi_reuse_skb(napi, skb);
4800                 break;
4801
4802         case GRO_MERGED_FREE:
4803                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4804                         napi_skb_free_stolen_head(skb);
4805                 else
4806                         napi_reuse_skb(napi, skb);
4807                 break;
4808
4809         case GRO_MERGED:
4810                 break;
4811         }
4812
4813         return ret;
4814 }
4815
4816 /* Upper GRO stack assumes network header starts at gro_offset=0
4817  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4818  * We copy ethernet header into skb->data to have a common layout.
4819  */
     /* Takes the skb out of napi->skb (the pointer is cleared), gets
      * sizeof(struct ethhdr) bytes of header into the linear area, pulls them
      * and sets skb->protocol from the header's h_proto field.
      *
      * Returns the prepared skb, or NULL when skb_gro_header_slow() cannot
      * supply the header; in that case a rate-limited warning is logged and
      * the skb is given to napi_reuse_skb().
      */
4820 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4821 {
4822         struct sk_buff *skb = napi->skb;
4823         const struct ethhdr *eth;
4824         unsigned int hlen = sizeof(*eth);
4825
4826         napi->skb = NULL;
4827
4828         skb_reset_mac_header(skb);
4829         skb_gro_reset_offset(skb);
4830
4831         eth = skb_gro_header_fast(skb, 0);
4832         if (unlikely(skb_gro_header_hard(skb, hlen))) {
                     /* Fast pointer not usable for hlen bytes: take the slow path */
4833                 eth = skb_gro_header_slow(skb, hlen, 0);
4834                 if (unlikely(!eth)) {
4835                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4836                                              __func__, napi->dev->name);
4837                         napi_reuse_skb(napi, skb);
4838                         return NULL;
4839                 }
4840         } else {
                     /* Copy the header from frag0 into the head and move the
                      * frag0 window past it
                      */
4841                 gro_pull_from_frag0(skb, hlen);
4842                 NAPI_GRO_CB(skb)->frag0 += hlen;
4843                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4844         }
4845         __skb_pull(skb, hlen);
4846
4847         /*
4848          * This works because the only protocols we care about don't require
4849          * special handling.
4850          * We'll fix it up properly in napi_frags_finish()
4851          */
4852         skb->protocol = eth->h_proto;
4853
4854         return skb;
4855 }
4856
4857 gro_result_t napi_gro_frags(struct napi_struct *napi)
4858 {
4859         struct sk_buff *skb = napi_frags_skb(napi);
4860
4861         if (!skb)
4862                 return GRO_DROP;
4863
4864         trace_napi_gro_frags_entry(skb);
4865
4866         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4867 }
4868 EXPORT_SYMBOL(napi_gro_frags);
4869
4870 /* Compute the checksum from gro_offset and return the folded value
4871  * after adding in any pseudo checksum.
4872  */
4873 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4874 {
4875         __wsum wsum;
4876         __sum16 sum;
4877
4878         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4879
4880         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4881         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4882         if (likely(!sum)) {
4883                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4884                     !skb->csum_complete_sw)
4885                         netdev_rx_csum_fault(skb->dev);
4886         }
4887
4888         NAPI_GRO_CB(skb)->csum = wsum;
4889         NAPI_GRO_CB(skb)->csum_valid = 1;
4890
4891         return sum;
4892 }
4893 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4894
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 *
 * The pending list is detached from @sd before irqs are re-enabled; the
 * IPIs themselves are sent afterwards, and only to CPUs that are online.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;
	local_irq_enable();

	/* Send pending IPIs to kick RPS processing on remote cpus. */
	do {
		struct softnet_data *following = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = following;
	} while (pending);
#else
	local_irq_enable();
#endif
}
4922
4923 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4924 {
4925 #ifdef CONFIG_RPS
4926         return sd->rps_ipi_list != NULL;
4927 #else
4928         return false;
4929 #endif
4930 }
4931
/* Poll function of the backlog NAPI embedded in struct softnet_data.
 *
 * Delivers skbs from sd->process_queue through __netif_receive_skb(), at
 * most @quota of them, and refills process_queue from sd->input_pkt_queue
 * whenever it runs empty. Once input_pkt_queue is found empty as well,
 * napi->state is cleared and the loop ends.
 *
 * Returns the number of skbs processed. When @quota is reached the function
 * returns immediately and napi->state is left as it was.
 */
4932 static int process_backlog(struct napi_struct *napi, int quota)
4933 {
4934         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4935         bool again = true;
4936         int work = 0;
4937
4938         /* Check if we have pending ipi, it's better to send them now,
4939          * not waiting net_rx_action() end.
4940          */
4941         if (sd_has_rps_ipi_waiting(sd)) {
4942                 local_irq_disable();
4943                 net_rps_action_and_irq_enable(sd);
4944         }
4945
4946         napi->weight = weight_p;
4947         while (again) {
4948                 struct sk_buff *skb;
4949
                     /* process_queue is dequeued here without rps_lock() */
4950                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4951                         rcu_read_lock();
4952                         __netif_receive_skb(skb);
4953                         rcu_read_unlock();
4954                         input_queue_head_incr(sd);
4955                         if (++work >= quota)
4956                                 return work;
4957
4958                 }
4959
                     /* input_pkt_queue is only touched with irqs off and
                      * rps_lock() held
                      */
4960                 local_irq_disable();
4961                 rps_lock(sd);
4962                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4963                         /*
4964                          * Inline a custom version of __napi_complete().
4965                          * only current cpu owns and manipulates this napi,
4966                          * and NAPI_STATE_SCHED is the only possible flag set
4967                          * on backlog.
4968                          * We can use a plain write instead of clear_bit(),
4969                          * and we don't need an smp_mb() memory barrier.
4970                          */
4971                         napi->state = 0;
4972                         again = false;
4973                 } else {
                             /* Move everything queued so far over in one go */
4974                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4975                                                    &sd->process_queue);
4976                 }
4977                 rps_unlock(sd);
4978                 local_irq_enable();
4979         }
4980
4981         return work;
4982 }
4983
4984 /**
4985  * __napi_schedule - schedule for receive
4986  * @n: entry to schedule
4987  *
4988  * The entry's receive function will be scheduled to run.
4989  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4990  */
4991 void __napi_schedule(struct napi_struct *n)
4992 {
4993         unsigned long flags;
4994
4995         local_irq_save(flags);
4996         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4997         local_irq_restore(flags);
4998 }
4999 EXPORT_SYMBOL(__napi_schedule);
5000
5001 /**
5002  * __napi_schedule_irqoff - schedule for receive
5003  * @n: entry to schedule
5004  *
5005  * Variant of __napi_schedule() assuming hard irqs are masked
5006  */
5007 void __napi_schedule_irqoff(struct napi_struct *n)
5008 {
5009         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5010 }
5011 EXPORT_SYMBOL(__napi_schedule_irqoff);
5012
/* Take @n off the poll list it is on and clear NAPI_STATE_SCHED.
 * BUGs if NAPI_STATE_SCHED is not set on entry.
 * napi_complete_done() calls this with local irqs saved and disabled.
 */
5013 void __napi_complete(struct napi_struct *n)
5014 {
5015         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
5016
5017         list_del_init(&n->poll_list);
             /* Full barrier so the list removal is ordered before the bit clear */
5018         smp_mb__before_atomic();
5019         clear_bit(NAPI_STATE_SCHED, &n->state);
5020 }
5021 EXPORT_SYMBOL(__napi_complete);
5022
5023 void napi_complete_done(struct napi_struct *n, int work_done)
5024 {
5025         unsigned long flags;
5026
5027         /*
5028          * don't let napi dequeue from the cpu poll list
5029          * just in case its running on a different cpu
5030          */
5031         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
5032                 return;
5033
5034         if (n->gro_list) {
5035                 unsigned long timeout = 0;
5036
5037                 if (work_done)
5038                         timeout = n->dev->gro_flush_timeout;
5039
5040                 if (timeout)
5041                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
5042                                       HRTIMER_MODE_REL_PINNED);
5043                 else
5044                         napi_gro_flush(n, false);
5045         }
5046         if (likely(list_empty(&n->poll_list))) {
5047                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
5048         } else {
5049                 /* If n->poll_list is not empty, we need to mask irqs */
5050                 local_irq_save(flags);
5051                 __napi_complete(n);
5052                 local_irq_restore(flags);
5053         }
5054 }
5055 EXPORT_SYMBOL(napi_complete_done);
5056
5057 /* must be called under rcu_read_lock(), as we dont take a reference */
5058 static struct napi_struct *napi_by_id(unsigned int napi_id)
5059 {
5060         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5061         struct napi_struct *napi;
5062
5063         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5064                 if (napi->napi_id == napi_id)
5065                         return napi;
5066
5067         return NULL;
5068 }
5069
5070 #if defined(CONFIG_NET_RX_BUSY_POLL)
     /* Budget passed to napi->poll() on each busy-poll iteration */
5071 #define BUSY_POLL_BUDGET 8
     /* Busy-poll the NAPI instance whose id is stored in sk->sk_napi_id.
      *
      * Each iteration runs, with bottom halves disabled, either the driver's
      * ndo_busy_poll() or - if the NAPI instance could be claimed with
      * napi_schedule_prep() - its regular poll function with
      * BUSY_POLL_BUDGET. The loop ends when @nonblock is set, when
      * sk->sk_receive_queue is no longer empty, when a reschedule is needed,
      * when the busy-loop time limit is over or when an iteration yields
      * LL_FLUSH_FAILED.
      *
      * Returns false if no NAPI instance is found for the id, otherwise
      * whether sk->sk_receive_queue is non-empty on exit.
      */
5072 bool sk_busy_loop(struct sock *sk, int nonblock)
5073 {
5074         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5075         int (*busy_poll)(struct napi_struct *dev);
5076         struct napi_struct *napi;
5077         int rc = false;
5078
             /* napi_by_id() takes no reference, so stay inside an RCU read section */
5079         rcu_read_lock();
5080
5081         napi = napi_by_id(sk->sk_napi_id);
5082         if (!napi)
5083                 goto out;
5084
5085         /* Note: ndo_busy_poll method is optional in linux-4.5 */
5086         busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5087
5088         do {
5089                 rc = 0;
5090                 local_bh_disable();
5091                 if (busy_poll) {
5092                         rc = busy_poll(napi);
5093                 } else if (napi_schedule_prep(napi)) {
5094                         void *have = netpoll_poll_lock(napi);
5095
5096                         if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
5097                                 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5098                                 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
                                     /* Whole budget used: complete here and
                                      * schedule the instance again
                                      */
5099                                 if (rc == BUSY_POLL_BUDGET) {
5100                                         napi_complete_done(napi, rc);
5101                                         napi_schedule(napi);
5102                                 }
5103                         }
5104                         netpoll_poll_unlock(have);
5105                 }
                     /* Positive results are added to the BUSYPOLLRXPACKETS counter */
5106                 if (rc > 0)
5107                         __NET_ADD_STATS(sock_net(sk),
5108                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5109                 local_bh_enable();
5110
5111                 if (rc == LL_FLUSH_FAILED)
5112                         break; /* permanent failure */
5113
5114                 cpu_relax();
5115         } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5116                  !need_resched() && !busy_loop_timeout(end_time));
5117
5118         rc = !skb_queue_empty(&sk->sk_receive_queue);
5119 out:
5120         rcu_read_unlock();
5121         return rc;
5122 }
5123 EXPORT_SYMBOL(sk_busy_loop);
5124
5125 #endif /* CONFIG_NET_RX_BUSY_POLL */
5126
5127 void napi_hash_add(struct napi_struct *napi)
5128 {
5129         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5130             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5131                 return;
5132
5133         spin_lock(&napi_hash_lock);
5134
5135         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5136         do {
5137                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5138                         napi_gen_id = NR_CPUS + 1;
5139         } while (napi_by_id(napi_gen_id));
5140         napi->napi_id = napi_gen_id;
5141
5142         hlist_add_head_rcu(&napi->napi_hash_node,
5143                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5144
5145         spin_unlock(&napi_hash_lock);
5146 }
5147 EXPORT_SYMBOL_GPL(napi_hash_add);
5148
5149 /* Warning : caller is responsible to make sure rcu grace period
5150  * is respected before freeing memory containing @napi
5151  */
5152 bool napi_hash_del(struct napi_struct *napi)
5153 {
5154         bool rcu_sync_needed = false;
5155
5156         spin_lock(&napi_hash_lock);
5157
5158         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5159                 rcu_sync_needed = true;
5160                 hlist_del_rcu(&napi->napi_hash_node);
5161         }
5162         spin_unlock(&napi_hash_lock);
5163         return rcu_sync_needed;
5164 }
5165 EXPORT_SYMBOL_GPL(napi_hash_del);
5166
5167 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5168 {
5169         struct napi_struct *napi;
5170
5171         napi = container_of(timer, struct napi_struct, timer);
5172         if (napi->gro_list)
5173                 napi_schedule(napi);
5174
5175         return HRTIMER_NORESTART;
5176 }
5177
5178 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5179                     int (*poll)(struct napi_struct *, int), int weight)
5180 {
5181         INIT_LIST_HEAD(&napi->poll_list);
5182         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5183         napi->timer.function = napi_watchdog;
5184         napi->gro_count = 0;
5185         napi->gro_list = NULL;
5186         napi->skb = NULL;
5187         napi->poll = poll;
5188         if (weight > NAPI_POLL_WEIGHT)
5189                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5190                             weight, dev->name);
5191         napi->weight = weight;
5192         list_add(&napi->dev_list, &dev->napi_list);
5193         napi->dev = dev;
5194 #ifdef CONFIG_NETPOLL
5195         spin_lock_init(&napi->poll_lock);
5196         napi->poll_owner = -1;
5197 #endif
5198         set_bit(NAPI_STATE_SCHED, &napi->state);
5199         napi_hash_add(napi);
5200 }
5201 EXPORT_SYMBOL(netif_napi_add);
5202
5203 void napi_disable(struct napi_struct *n)
5204 {
5205         might_sleep();
5206         set_bit(NAPI_STATE_DISABLE, &n->state);
5207
5208         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5209                 msleep(1);
5210         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5211                 msleep(1);
5212
5213         hrtimer_cancel(&n->timer);
5214
5215         clear_bit(NAPI_STATE_DISABLE, &n->state);
5216 }
5217 EXPORT_SYMBOL(napi_disable);
5218
5219 /* Must be called in process context */
5220 void netif_napi_del(struct napi_struct *napi)
5221 {
5222         might_sleep();
5223         if (napi_hash_del(napi))
5224                 synchronize_net();
5225         list_del_init(&napi->dev_list);
5226         napi_free_frags(napi);
5227
5228         kfree_skb_list(napi->gro_list);
5229         napi->gro_list = NULL;
5230         napi->gro_count = 0;
5231 }
5232 EXPORT_SYMBOL(netif_napi_del);
5233
5234 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5235 {
5236         void *have;
5237         int work, weight;
5238
5239         list_del_init(&n->poll_list);
5240
5241         have = netpoll_poll_lock(n);
5242
5243         weight = n->weight;
5244
5245         /* This NAPI_STATE_SCHED test is for avoiding a race
5246          * with netpoll's poll_napi().  Only the entity which
5247          * obtains the lock and sees NAPI_STATE_SCHED set will
5248          * actually make the ->poll() call.  Therefore we avoid
5249          * accidentally calling ->poll() when NAPI is not scheduled.
5250          */
5251         work = 0;
5252         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5253                 work = n->poll(n, weight);
5254                 trace_napi_poll(n, work, weight);
5255         }
5256
5257         WARN_ON_ONCE(work > weight);
5258
5259         if (likely(work < weight))
5260                 goto out_unlock;
5261
5262         /* Drivers must not modify the NAPI state if they
5263          * consume the entire weight.  In such cases this code
5264          * still "owns" the NAPI instance and therefore can
5265          * move the instance around on the list at-will.
5266          */
5267         if (unlikely(napi_disable_pending(n))) {
5268                 napi_complete(n);
5269                 goto out_unlock;
5270         }
5271
5272         if (n->gro_list) {
5273                 /* flush too old packets
5274                  * If HZ < 1000, flush all packets.
5275                  */
5276                 napi_gro_flush(n, HZ >= 1000);
5277         }
5278
5279         /* Some drivers may have called napi_schedule
5280          * prior to exhausting their budget.
5281          */
5282         if (unlikely(!list_empty(&n->poll_list))) {
5283                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5284                              n->dev ? n->dev->name : "backlog");
5285                 goto out_unlock;
5286         }
5287
5288         list_add_tail(&n->poll_list, repoll);
5289
5290 out_unlock:
5291         netpoll_poll_unlock(have);
5292
5293         return work;
5294 }
5295
5296 static __latent_entropy void net_rx_action(struct softirq_action *h)
5297 {
5298         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5299         unsigned long time_limit = jiffies + 2;
5300         int budget = netdev_budget;
5301         LIST_HEAD(list);
5302         LIST_HEAD(repoll);
5303
5304         local_irq_disable();
5305         list_splice_init(&sd->poll_list, &list);
5306         local_irq_enable();
5307
5308         for (;;) {
5309                 struct napi_struct *n;
5310
5311                 if (list_empty(&list)) {
5312                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5313                                 return;
5314                         break;
5315                 }
5316
5317                 n = list_first_entry(&list, struct napi_struct, poll_list);
5318                 budget -= napi_poll(n, &repoll);
5319
5320                 /* If softirq window is exhausted then punt.
5321                  * Allow this to run for 2 jiffies since which will allow
5322                  * an average latency of 1.5/HZ.
5323                  */
5324                 if (unlikely(budget <= 0 ||
5325                              time_after_eq(jiffies, time_limit))) {
5326                         sd->time_squeeze++;
5327                         break;
5328                 }
5329         }
5330
5331         __kfree_skb_flush();
5332         local_irq_disable();
5333
5334         list_splice_tail_init(&sd->poll_list, &list);
5335         list_splice_tail(&repoll, &list);
5336         list_splice(&list, &sd->poll_list);
5337         if (!list_empty(&sd->poll_list))
5338                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5339
5340         net_rps_action_and_irq_enable(sd);
5341 }
5342
5343 struct netdev_adjacent {
5344         struct net_device *dev;
5345
5346         /* upper master flag, there can only be one master device per list */
5347         bool master;
5348
5349         /* counter for the number of times this device was added to us */
5350         u16 ref_nr;
5351
5352         /* private field for the users */
5353         void *private;
5354
5355         struct list_head list;
5356         struct rcu_head rcu;
5357 };
5358
5359 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5360                                                  struct list_head *adj_list)
5361 {
5362         struct netdev_adjacent *adj;
5363
5364         list_for_each_entry(adj, adj_list, list) {
5365                 if (adj->dev == adj_dev)
5366                         return adj;
5367         }
5368         return NULL;
5369 }
5370
5371 /**
5372  * netdev_has_upper_dev - Check if device is linked to an upper device
5373  * @dev: device
5374  * @upper_dev: upper device to check
5375  *
5376  * Find out if a device is linked to specified upper device and return true
5377  * in case it is. Note that this checks only immediate upper device,
5378  * not through a complete stack of devices. The caller must hold the RTNL lock.
5379  */
5380 bool netdev_has_upper_dev(struct net_device *dev,
5381                           struct net_device *upper_dev)
5382 {
5383         ASSERT_RTNL();
5384
5385         return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5386 }
5387 EXPORT_SYMBOL(netdev_has_upper_dev);
5388
5389 /**
5390  * netdev_has_any_upper_dev - Check if device is linked to some device
5391  * @dev: device
5392  *
5393  * Find out if a device is linked to an upper device and return true in case
5394  * it is. The caller must hold the RTNL lock.
5395  */
5396 bool netdev_has_any_upper_dev(struct net_device *dev)
5397 {
5398         ASSERT_RTNL();
5399
5400         return !list_empty(&dev->all_adj_list.upper);
5401 }
5402 EXPORT_SYMBOL(netdev_has_any_upper_dev);
5403
5404 /**
5405  * netdev_master_upper_dev_get - Get master upper device
5406  * @dev: device
5407  *
5408  * Find a master upper device and return pointer to it or NULL in case
5409  * it's not there. The caller must hold the RTNL lock.
5410  */
5411 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5412 {
5413         struct netdev_adjacent *upper;
5414
5415         ASSERT_RTNL();
5416
5417         if (list_empty(&dev->adj_list.upper))
5418                 return NULL;
5419
5420         upper = list_first_entry(&dev->adj_list.upper,
5421                                  struct netdev_adjacent, list);
5422         if (likely(upper->master))
5423                 return upper->dev;
5424         return NULL;
5425 }
5426 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5427
5428 void *netdev_adjacent_get_private(struct list_head *adj_list)
5429 {
5430         struct netdev_adjacent *adj;
5431
5432         adj = list_entry(adj_list, struct netdev_adjacent, list);
5433
5434         return adj->private;
5435 }
5436 EXPORT_SYMBOL(netdev_adjacent_get_private);
5437
5438 /**
5439  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5440  * @dev: device
5441  * @iter: list_head ** of the current position
5442  *
5443  * Gets the next device from the dev's upper list, starting from iter
5444  * position. The caller must hold RCU read lock.
5445  */
5446 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5447                                                  struct list_head **iter)
5448 {
5449         struct netdev_adjacent *upper;
5450
5451         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5452
5453         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5454
5455         if (&upper->list == &dev->adj_list.upper)
5456                 return NULL;
5457
5458         *iter = &upper->list;
5459
5460         return upper->dev;
5461 }
5462 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5463
5464 /**
5465  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5466  * @dev: device
5467  * @iter: list_head ** of the current position
5468  *
5469  * Gets the next device from the dev's upper list, starting from iter
5470  * position. The caller must hold RCU read lock.
5471  */
5472 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5473                                                      struct list_head **iter)
5474 {
5475         struct netdev_adjacent *upper;
5476
5477         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5478
5479         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5480
5481         if (&upper->list == &dev->all_adj_list.upper)
5482                 return NULL;
5483
5484         *iter = &upper->list;
5485
5486         return upper->dev;
5487 }
5488 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5489
5490 /**
5491  * netdev_lower_get_next_private - Get the next ->private from the
5492  *                                 lower neighbour list
5493  * @dev: device
5494  * @iter: list_head ** of the current position
5495  *
5496  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5497  * list, starting from iter position. The caller must hold either hold the
5498  * RTNL lock or its own locking that guarantees that the neighbour lower
5499  * list will remain unchanged.
5500  */
5501 void *netdev_lower_get_next_private(struct net_device *dev,
5502                                     struct list_head **iter)
5503 {
5504         struct netdev_adjacent *lower;
5505
5506         lower = list_entry(*iter, struct netdev_adjacent, list);
5507
5508         if (&lower->list == &dev->adj_list.lower)
5509                 return NULL;
5510
5511         *iter = lower->list.next;
5512
5513         return lower->private;
5514 }
5515 EXPORT_SYMBOL(netdev_lower_get_next_private);
5516
5517 /**
5518  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5519  *                                     lower neighbour list, RCU
5520  *                                     variant
5521  * @dev: device
5522  * @iter: list_head ** of the current position
5523  *
5524  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5525  * list, starting from iter position. The caller must hold RCU read lock.
5526  */
5527 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5528                                         struct list_head **iter)
5529 {
5530         struct netdev_adjacent *lower;
5531
5532         WARN_ON_ONCE(!rcu_read_lock_held());
5533
5534         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5535
5536         if (&lower->list == &dev->adj_list.lower)
5537                 return NULL;
5538
5539         *iter = &lower->list;
5540
5541         return lower->private;
5542 }
5543 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5544
5545 /**
5546  * netdev_lower_get_next - Get the next device from the lower neighbour
5547  *                         list
5548  * @dev: device
5549  * @iter: list_head ** of the current position
5550  *
5551  * Gets the next netdev_adjacent from the dev's lower neighbour
5552  * list, starting from iter position. The caller must hold RTNL lock or
5553  * its own locking that guarantees that the neighbour lower
5554  * list will remain unchanged.
5555  */
5556 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5557 {
5558         struct netdev_adjacent *lower;
5559
5560         lower = list_entry(*iter, struct netdev_adjacent, list);
5561
5562         if (&lower->list == &dev->adj_list.lower)
5563                 return NULL;
5564
5565         *iter = lower->list.next;
5566
5567         return lower->dev;
5568 }
5569 EXPORT_SYMBOL(netdev_lower_get_next);
5570
5571 /**
5572  * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5573  * @dev: device
5574  * @iter: list_head ** of the current position
5575  *
5576  * Gets the next netdev_adjacent from the dev's all lower neighbour
5577  * list, starting from iter position. The caller must hold RTNL lock or
5578  * its own locking that guarantees that the neighbour all lower
5579  * list will remain unchanged.
5580  */
5581 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5582 {
5583         struct netdev_adjacent *lower;
5584
5585         lower = list_entry(*iter, struct netdev_adjacent, list);
5586
5587         if (&lower->list == &dev->all_adj_list.lower)
5588                 return NULL;
5589
5590         *iter = lower->list.next;
5591
5592         return lower->dev;
5593 }
5594 EXPORT_SYMBOL(netdev_all_lower_get_next);
5595
5596 /**
5597  * netdev_all_lower_get_next_rcu - Get the next device from all
5598  *                                 lower neighbour list, RCU variant
5599  * @dev: device
5600  * @iter: list_head ** of the current position
5601  *
5602  * Gets the next netdev_adjacent from the dev's all lower neighbour
5603  * list, starting from iter position. The caller must hold RCU read lock.
5604  */
5605 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5606                                                  struct list_head **iter)
5607 {
5608         struct netdev_adjacent *lower;
5609
5610         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5611
5612         if (&lower->list == &dev->all_adj_list.lower)
5613                 return NULL;
5614
5615         *iter = &lower->list;
5616
5617         return lower->dev;
5618 }
5619 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5620
5621 /**
5622  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5623  *                                     lower neighbour list, RCU
5624  *                                     variant
5625  * @dev: device
5626  *
5627  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5628  * list. The caller must hold RCU read lock.
5629  */
5630 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5631 {
5632         struct netdev_adjacent *lower;
5633
5634         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5635                         struct netdev_adjacent, list);
5636         if (lower)
5637                 return lower->private;
5638         return NULL;
5639 }
5640 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5641
5642 /**
5643  * netdev_master_upper_dev_get_rcu - Get master upper device
5644  * @dev: device
5645  *
5646  * Find a master upper device and return pointer to it or NULL in case
5647  * it's not there. The caller must hold the RCU read lock.
5648  */
5649 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5650 {
5651         struct netdev_adjacent *upper;
5652
5653         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5654                                        struct netdev_adjacent, list);
5655         if (upper && likely(upper->master))
5656                 return upper->dev;
5657         return NULL;
5658 }
5659 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5660
5661 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5662                               struct net_device *adj_dev,
5663                               struct list_head *dev_list)
5664 {
5665         char linkname[IFNAMSIZ+7];
5666         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5667                 "upper_%s" : "lower_%s", adj_dev->name);
5668         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5669                                  linkname);
5670 }
5671 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5672                                char *name,
5673                                struct list_head *dev_list)
5674 {
5675         char linkname[IFNAMSIZ+7];
5676         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5677                 "upper_%s" : "lower_%s", name);
5678         sysfs_remove_link(&(dev->dev.kobj), linkname);
5679 }
5680
5681 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5682                                                  struct net_device *adj_dev,
5683                                                  struct list_head *dev_list)
5684 {
5685         return (dev_list == &dev->adj_list.upper ||
5686                 dev_list == &dev->adj_list.lower) &&
5687                 net_eq(dev_net(dev), dev_net(adj_dev));
5688 }
5689
5690 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5691                                         struct net_device *adj_dev,
5692                                         u16 ref_nr,
5693                                         struct list_head *dev_list,
5694                                         void *private, bool master)
5695 {
5696         struct netdev_adjacent *adj;
5697         int ret;
5698
5699         adj = __netdev_find_adj(adj_dev, dev_list);
5700
5701         if (adj) {
5702                 adj->ref_nr += ref_nr;
5703                 return 0;
5704         }
5705
5706         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5707         if (!adj)
5708                 return -ENOMEM;
5709
5710         adj->dev = adj_dev;
5711         adj->master = master;
5712         adj->ref_nr = ref_nr;
5713         adj->private = private;
5714         dev_hold(adj_dev);
5715
5716         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5717                  adj_dev->name, dev->name, adj_dev->name);
5718
5719         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5720                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5721                 if (ret)
5722                         goto free_adj;
5723         }
5724
5725         /* Ensure that master link is always the first item in list. */
5726         if (master) {
5727                 ret = sysfs_create_link(&(dev->dev.kobj),
5728                                         &(adj_dev->dev.kobj), "master");
5729                 if (ret)
5730                         goto remove_symlinks;
5731
5732                 list_add_rcu(&adj->list, dev_list);
5733         } else {
5734                 list_add_tail_rcu(&adj->list, dev_list);
5735         }
5736
5737         return 0;
5738
5739 remove_symlinks:
5740         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5741                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5742 free_adj:
5743         kfree(adj);
5744         dev_put(adj_dev);
5745
5746         return ret;
5747 }
5748
5749 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5750                                          struct net_device *adj_dev,
5751                                          u16 ref_nr,
5752                                          struct list_head *dev_list)
5753 {
5754         struct netdev_adjacent *adj;
5755
5756         adj = __netdev_find_adj(adj_dev, dev_list);
5757
5758         if (!adj) {
5759                 pr_err("tried to remove device %s from %s\n",
5760                        dev->name, adj_dev->name);
5761                 BUG();
5762         }
5763
5764         if (adj->ref_nr > ref_nr) {
5765                 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5766                          ref_nr, adj->ref_nr-ref_nr);
5767                 adj->ref_nr -= ref_nr;
5768                 return;
5769         }
5770
5771         if (adj->master)
5772                 sysfs_remove_link(&(dev->dev.kobj), "master");
5773
5774         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5775                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5776
5777         list_del_rcu(&adj->list);
5778         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5779                  adj_dev->name, dev->name, adj_dev->name);
5780         dev_put(adj_dev);
5781         kfree_rcu(adj, rcu);
5782 }
5783
5784 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5785                                             struct net_device *upper_dev,
5786                                             u16 ref_nr,
5787                                             struct list_head *up_list,
5788                                             struct list_head *down_list,
5789                                             void *private, bool master)
5790 {
5791         int ret;
5792
5793         ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5794                                            private, master);
5795         if (ret)
5796                 return ret;
5797
5798         ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5799                                            private, false);
5800         if (ret) {
5801                 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5802                 return ret;
5803         }
5804
5805         return 0;
5806 }
5807
5808 static int __netdev_adjacent_dev_link(struct net_device *dev,
5809                                       struct net_device *upper_dev,
5810                                       u16 ref_nr)
5811 {
5812         return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5813                                                 &dev->all_adj_list.upper,
5814                                                 &upper_dev->all_adj_list.lower,
5815                                                 NULL, false);
5816 }
5817
5818 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5819                                                struct net_device *upper_dev,
5820                                                u16 ref_nr,
5821                                                struct list_head *up_list,
5822                                                struct list_head *down_list)
5823 {
5824         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5825         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5826 }
5827
5828 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5829                                          struct net_device *upper_dev,
5830                                          u16 ref_nr)
5831 {
5832         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5833                                            &dev->all_adj_list.upper,
5834                                            &upper_dev->all_adj_list.lower);
5835 }
5836
5837 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5838                                                 struct net_device *upper_dev,
5839                                                 void *private, bool master)
5840 {
5841         int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5842
5843         if (ret)
5844                 return ret;
5845
5846         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5847                                                &dev->adj_list.upper,
5848                                                &upper_dev->adj_list.lower,
5849                                                private, master);
5850         if (ret) {
5851                 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5852                 return ret;
5853         }
5854
5855         return 0;
5856 }
5857
5858 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5859                                                    struct net_device *upper_dev)
5860 {
5861         __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5862         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5863                                            &dev->adj_list.upper,
5864                                            &upper_dev->adj_list.lower);
5865 }
5866
5867 static int __netdev_upper_dev_link(struct net_device *dev,
5868                                    struct net_device *upper_dev, bool master,
5869                                    void *upper_priv, void *upper_info)
5870 {
5871         struct netdev_notifier_changeupper_info changeupper_info;
5872         struct netdev_adjacent *i, *j, *to_i, *to_j;
5873         int ret = 0;
5874
5875         ASSERT_RTNL();
5876
5877         if (dev == upper_dev)
5878                 return -EBUSY;
5879
5880         /* To prevent loops, check if dev is not upper device to upper_dev. */
5881         if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5882                 return -EBUSY;
5883
5884         if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5885                 return -EEXIST;
5886
5887         if (master && netdev_master_upper_dev_get(dev))
5888                 return -EBUSY;
5889
5890         changeupper_info.upper_dev = upper_dev;
5891         changeupper_info.master = master;
5892         changeupper_info.linking = true;
5893         changeupper_info.upper_info = upper_info;
5894
5895         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5896                                             &changeupper_info.info);
5897         ret = notifier_to_errno(ret);
5898         if (ret)
5899                 return ret;
5900
5901         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5902                                                    master);
5903         if (ret)
5904                 return ret;
5905
5906         /* Now that we linked these devs, make all the upper_dev's
5907          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5908          * versa, and don't forget the devices itself. All of these
5909          * links are non-neighbours.
5910          */
5911         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5912                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5913                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5914                                  i->dev->name, j->dev->name);
5915                         ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5916                         if (ret)
5917                                 goto rollback_mesh;
5918                 }
5919         }
5920
5921         /* add dev to every upper_dev's upper device */
5922         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5923                 pr_debug("linking %s's upper device %s with %s\n",
5924                          upper_dev->name, i->dev->name, dev->name);
5925                 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5926                 if (ret)
5927                         goto rollback_upper_mesh;
5928         }
5929
5930         /* add upper_dev to every dev's lower device */
5931         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5932                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5933                          i->dev->name, upper_dev->name);
5934                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5935                 if (ret)
5936                         goto rollback_lower_mesh;
5937         }
5938
5939         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5940                                             &changeupper_info.info);
5941         ret = notifier_to_errno(ret);
5942         if (ret)
5943                 goto rollback_lower_mesh;
5944
5945         return 0;
5946
5947 rollback_lower_mesh:
5948         to_i = i;
5949         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5950                 if (i == to_i)
5951                         break;
5952                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5953         }
5954
5955         i = NULL;
5956
5957 rollback_upper_mesh:
5958         to_i = i;
5959         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5960                 if (i == to_i)
5961                         break;
5962                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5963         }
5964
5965         i = j = NULL;
5966
5967 rollback_mesh:
5968         to_i = i;
5969         to_j = j;
5970         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5971                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5972                         if (i == to_i && j == to_j)
5973                                 break;
5974                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5975                 }
5976                 if (i == to_i)
5977                         break;
5978         }
5979
5980         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5981
5982         return ret;
5983 }
5984
5985 /**
5986  * netdev_upper_dev_link - Add a link to the upper device
5987  * @dev: device
5988  * @upper_dev: new upper device
5989  *
5990  * Adds a link to device which is upper to this one. The caller must hold
5991  * the RTNL lock. On a failure a negative errno code is returned.
5992  * On success the reference counts are adjusted and the function
5993  * returns zero.
5994  */
5995 int netdev_upper_dev_link(struct net_device *dev,
5996                           struct net_device *upper_dev)
5997 {
5998         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5999 }
6000 EXPORT_SYMBOL(netdev_upper_dev_link);
6001
6002 /**
6003  * netdev_master_upper_dev_link - Add a master link to the upper device
6004  * @dev: device
6005  * @upper_dev: new upper device
6006  * @upper_priv: upper device private
6007  * @upper_info: upper info to be passed down via notifier
6008  *
6009  * Adds a link to device which is upper to this one. In this case, only
6010  * one master upper device can be linked, although other non-master devices
6011  * might be linked as well. The caller must hold the RTNL lock.
6012  * On a failure a negative errno code is returned. On success the reference
6013  * counts are adjusted and the function returns zero.
6014  */
6015 int netdev_master_upper_dev_link(struct net_device *dev,
6016                                  struct net_device *upper_dev,
6017                                  void *upper_priv, void *upper_info)
6018 {
6019         return __netdev_upper_dev_link(dev, upper_dev, true,
6020                                        upper_priv, upper_info);
6021 }
6022 EXPORT_SYMBOL(netdev_master_upper_dev_link);
6023
6024 /**
6025  * netdev_upper_dev_unlink - Removes a link to upper device
6026  * @dev: device
6027  * @upper_dev: new upper device
6028  *
6029  * Removes a link to device which is upper to this one. The caller must hold
6030  * the RTNL lock.
6031  */
6032 void netdev_upper_dev_unlink(struct net_device *dev,
6033                              struct net_device *upper_dev)
6034 {
6035         struct netdev_notifier_changeupper_info changeupper_info;
6036         struct netdev_adjacent *i, *j;
6037         ASSERT_RTNL();
6038
6039         changeupper_info.upper_dev = upper_dev;
6040         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6041         changeupper_info.linking = false;
6042
6043         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6044                                       &changeupper_info.info);
6045
6046         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6047
6048         /* Here is the tricky part. We must remove all dev's lower
6049          * devices from all upper_dev's upper devices and vice
6050          * versa, to maintain the graph relationship.
6051          */
6052         list_for_each_entry(i, &dev->all_adj_list.lower, list)
6053                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
6054                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
6055
6056         /* remove also the devices itself from lower/upper device
6057          * list
6058          */
6059         list_for_each_entry(i, &dev->all_adj_list.lower, list)
6060                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
6061
6062         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
6063                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
6064
6065         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6066                                       &changeupper_info.info);
6067 }
6068 EXPORT_SYMBOL(netdev_upper_dev_unlink);
6069
6070 /**
6071  * netdev_bonding_info_change - Dispatch event about slave change
6072  * @dev: device
6073  * @bonding_info: info to dispatch
6074  *
6075  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6076  * The caller must hold the RTNL lock.
6077  */
6078 void netdev_bonding_info_change(struct net_device *dev,
6079                                 struct netdev_bonding_info *bonding_info)
6080 {
6081         struct netdev_notifier_bonding_info     info;
6082
6083         memcpy(&info.bonding_info, bonding_info,
6084                sizeof(struct netdev_bonding_info));
6085         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6086                                       &info.info);
6087 }
6088 EXPORT_SYMBOL(netdev_bonding_info_change);
6089
6090 static void netdev_adjacent_add_links(struct net_device *dev)
6091 {
6092         struct netdev_adjacent *iter;
6093
6094         struct net *net = dev_net(dev);
6095
6096         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6097                 if (!net_eq(net, dev_net(iter->dev)))
6098                         continue;
6099                 netdev_adjacent_sysfs_add(iter->dev, dev,
6100                                           &iter->dev->adj_list.lower);
6101                 netdev_adjacent_sysfs_add(dev, iter->dev,
6102                                           &dev->adj_list.upper);
6103         }
6104
6105         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6106                 if (!net_eq(net, dev_net(iter->dev)))
6107                         continue;
6108                 netdev_adjacent_sysfs_add(iter->dev, dev,
6109                                           &iter->dev->adj_list.upper);
6110                 netdev_adjacent_sysfs_add(dev, iter->dev,
6111                                           &dev->adj_list.lower);
6112         }
6113 }
6114
6115 static void netdev_adjacent_del_links(struct net_device *dev)
6116 {
6117         struct netdev_adjacent *iter;
6118
6119         struct net *net = dev_net(dev);
6120
6121         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6122                 if (!net_eq(net, dev_net(iter->dev)))
6123                         continue;
6124                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6125                                           &iter->dev->adj_list.lower);
6126                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6127                                           &dev->adj_list.upper);
6128         }
6129
6130         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6131                 if (!net_eq(net, dev_net(iter->dev)))
6132                         continue;
6133                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6134                                           &iter->dev->adj_list.upper);
6135                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6136                                           &dev->adj_list.lower);
6137         }
6138 }
6139
6140 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6141 {
6142         struct netdev_adjacent *iter;
6143
6144         struct net *net = dev_net(dev);
6145
6146         list_for_each_entry(iter, &dev->adj_list.upper, list) {
6147                 if (!net_eq(net, dev_net(iter->dev)))
6148                         continue;
6149                 netdev_adjacent_sysfs_del(iter->dev, oldname,
6150                                           &iter->dev->adj_list.lower);
6151                 netdev_adjacent_sysfs_add(iter->dev, dev,
6152                                           &iter->dev->adj_list.lower);
6153         }
6154
6155         list_for_each_entry(iter, &dev->adj_list.lower, list) {
6156                 if (!net_eq(net, dev_net(iter->dev)))
6157                         continue;
6158                 netdev_adjacent_sysfs_del(iter->dev, oldname,
6159                                           &iter->dev->adj_list.upper);
6160                 netdev_adjacent_sysfs_add(iter->dev, dev,
6161                                           &iter->dev->adj_list.upper);
6162         }
6163 }
6164
6165 void *netdev_lower_dev_get_private(struct net_device *dev,
6166                                    struct net_device *lower_dev)
6167 {
6168         struct netdev_adjacent *lower;
6169
6170         if (!lower_dev)
6171                 return NULL;
6172         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6173         if (!lower)
6174                 return NULL;
6175
6176         return lower->private;
6177 }
6178 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6179
6180
6181 int dev_get_nest_level(struct net_device *dev)
6182 {
6183         struct net_device *lower = NULL;
6184         struct list_head *iter;
6185         int max_nest = -1;
6186         int nest;
6187
6188         ASSERT_RTNL();
6189
6190         netdev_for_each_lower_dev(dev, lower, iter) {
6191                 nest = dev_get_nest_level(lower);
6192                 if (max_nest < nest)
6193                         max_nest = nest;
6194         }
6195
6196         return max_nest + 1;
6197 }
6198 EXPORT_SYMBOL(dev_get_nest_level);
6199
6200 /**
6201  * netdev_lower_change - Dispatch event about lower device state change
6202  * @lower_dev: device
6203  * @lower_state_info: state to dispatch
6204  *
6205  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6206  * The caller must hold the RTNL lock.
6207  */
6208 void netdev_lower_state_changed(struct net_device *lower_dev,
6209                                 void *lower_state_info)
6210 {
6211         struct netdev_notifier_changelowerstate_info changelowerstate_info;
6212
6213         ASSERT_RTNL();
6214         changelowerstate_info.lower_state_info = lower_state_info;
6215         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6216                                       &changelowerstate_info.info);
6217 }
6218 EXPORT_SYMBOL(netdev_lower_state_changed);
6219
6220 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6221                                            struct neighbour *n)
6222 {
6223         struct net_device *lower_dev, *stop_dev;
6224         struct list_head *iter;
6225         int err;
6226
6227         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6228                 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6229                         continue;
6230                 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6231                 if (err) {
6232                         stop_dev = lower_dev;
6233                         goto rollback;
6234                 }
6235         }
6236         return 0;
6237
6238 rollback:
6239         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6240                 if (lower_dev == stop_dev)
6241                         break;
6242                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6243                         continue;
6244                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6245         }
6246         return err;
6247 }
6248 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6249
6250 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6251                                           struct neighbour *n)
6252 {
6253         struct net_device *lower_dev;
6254         struct list_head *iter;
6255
6256         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6257                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6258                         continue;
6259                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6260         }
6261 }
6262 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6263
6264 static void dev_change_rx_flags(struct net_device *dev, int flags)
6265 {
6266         const struct net_device_ops *ops = dev->netdev_ops;
6267
6268         if (ops->ndo_change_rx_flags)
6269                 ops->ndo_change_rx_flags(dev, flags);
6270 }
6271
6272 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6273 {
6274         unsigned int old_flags = dev->flags;
6275         kuid_t uid;
6276         kgid_t gid;
6277
6278         ASSERT_RTNL();
6279
6280         dev->flags |= IFF_PROMISC;
6281         dev->promiscuity += inc;
6282         if (dev->promiscuity == 0) {
6283                 /*
6284                  * Avoid overflow.
6285                  * If inc causes overflow, untouch promisc and return error.
6286                  */
6287                 if (inc < 0)
6288                         dev->flags &= ~IFF_PROMISC;
6289                 else {
6290                         dev->promiscuity -= inc;
6291                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6292                                 dev->name);
6293                         return -EOVERFLOW;
6294                 }
6295         }
6296         if (dev->flags != old_flags) {
6297                 pr_info("device %s %s promiscuous mode\n",
6298                         dev->name,
6299                         dev->flags & IFF_PROMISC ? "entered" : "left");
6300                 if (audit_enabled) {
6301                         current_uid_gid(&uid, &gid);
6302                         audit_log(current->audit_context, GFP_ATOMIC,
6303                                 AUDIT_ANOM_PROMISCUOUS,
6304                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6305                                 dev->name, (dev->flags & IFF_PROMISC),
6306                                 (old_flags & IFF_PROMISC),
6307                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6308                                 from_kuid(&init_user_ns, uid),
6309                                 from_kgid(&init_user_ns, gid),
6310                                 audit_get_sessionid(current));
6311                 }
6312
6313                 dev_change_rx_flags(dev, IFF_PROMISC);
6314         }
6315         if (notify)
6316                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6317         return 0;
6318 }
6319
6320 /**
6321  *      dev_set_promiscuity     - update promiscuity count on a device
6322  *      @dev: device
6323  *      @inc: modifier
6324  *
6325  *      Add or remove promiscuity from a device. While the count in the device
6326  *      remains above zero the interface remains promiscuous. Once it hits zero
6327  *      the device reverts back to normal filtering operation. A negative inc
6328  *      value is used to drop promiscuity on the device.
6329  *      Return 0 if successful or a negative errno code on error.
6330  */
6331 int dev_set_promiscuity(struct net_device *dev, int inc)
6332 {
6333         unsigned int old_flags = dev->flags;
6334         int err;
6335
6336         err = __dev_set_promiscuity(dev, inc, true);
6337         if (err < 0)
6338                 return err;
6339         if (dev->flags != old_flags)
6340                 dev_set_rx_mode(dev);
6341         return err;
6342 }
6343 EXPORT_SYMBOL(dev_set_promiscuity);
6344
6345 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6346 {
6347         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6348
6349         ASSERT_RTNL();
6350
6351         dev->flags |= IFF_ALLMULTI;
6352         dev->allmulti += inc;
6353         if (dev->allmulti == 0) {
6354                 /*
6355                  * Avoid overflow.
6356                  * If inc causes overflow, untouch allmulti and return error.
6357                  */
6358                 if (inc < 0)
6359                         dev->flags &= ~IFF_ALLMULTI;
6360                 else {
6361                         dev->allmulti -= inc;
6362                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6363                                 dev->name);
6364                         return -EOVERFLOW;
6365                 }
6366         }
6367         if (dev->flags ^ old_flags) {
6368                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6369                 dev_set_rx_mode(dev);
6370                 if (notify)
6371                         __dev_notify_flags(dev, old_flags,
6372                                            dev->gflags ^ old_gflags);
6373         }
6374         return 0;
6375 }
6376
6377 /**
6378  *      dev_set_allmulti        - update allmulti count on a device
6379  *      @dev: device
6380  *      @inc: modifier
6381  *
6382  *      Add or remove reception of all multicast frames to a device. While the
6383  *      count in the device remains above zero the interface remains listening
6384  *      to all interfaces. Once it hits zero the device reverts back to normal
6385  *      filtering operation. A negative @inc value is used to drop the counter
6386  *      when releasing a resource needing all multicasts.
6387  *      Return 0 if successful or a negative errno code on error.
6388  */
6389
6390 int dev_set_allmulti(struct net_device *dev, int inc)
6391 {
6392         return __dev_set_allmulti(dev, inc, true);
6393 }
6394 EXPORT_SYMBOL(dev_set_allmulti);
6395
6396 /*
6397  *      Upload unicast and multicast address lists to device and
6398  *      configure RX filtering. When the device doesn't support unicast
6399  *      filtering it is put in promiscuous mode while unicast addresses
6400  *      are present.
6401  */
6402 void __dev_set_rx_mode(struct net_device *dev)
6403 {
6404         const struct net_device_ops *ops = dev->netdev_ops;
6405
6406         /* dev_open will call this function so the list will stay sane. */
6407         if (!(dev->flags&IFF_UP))
6408                 return;
6409
6410         if (!netif_device_present(dev))
6411                 return;
6412
6413         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6414                 /* Unicast addresses changes may only happen under the rtnl,
6415                  * therefore calling __dev_set_promiscuity here is safe.
6416                  */
6417                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6418                         __dev_set_promiscuity(dev, 1, false);
6419                         dev->uc_promisc = true;
6420                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6421                         __dev_set_promiscuity(dev, -1, false);
6422                         dev->uc_promisc = false;
6423                 }
6424         }
6425
6426         if (ops->ndo_set_rx_mode)
6427                 ops->ndo_set_rx_mode(dev);
6428 }
6429
/* Locked variant of __dev_set_rx_mode(): runs it between
 * netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
6436
6437 /**
6438  *      dev_get_flags - get flags reported to userspace
6439  *      @dev: device
6440  *
6441  *      Get the combination of flag bits exported through APIs to userspace.
6442  */
6443 unsigned int dev_get_flags(const struct net_device *dev)
6444 {
6445         unsigned int flags;
6446
6447         flags = (dev->flags & ~(IFF_PROMISC |
6448                                 IFF_ALLMULTI |
6449                                 IFF_RUNNING |
6450                                 IFF_LOWER_UP |
6451                                 IFF_DORMANT)) |
6452                 (dev->gflags & (IFF_PROMISC |
6453                                 IFF_ALLMULTI));
6454
6455         if (netif_running(dev)) {
6456                 if (netif_oper_up(dev))
6457                         flags |= IFF_RUNNING;
6458                 if (netif_carrier_ok(dev))
6459                         flags |= IFF_LOWER_UP;
6460                 if (netif_dormant(dev))
6461                         flags |= IFF_DORMANT;
6462         }
6463
6464         return flags;
6465 }
6466 EXPORT_SYMBOL(dev_get_flags);
6467
6468 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6469 {
6470         unsigned int old_flags = dev->flags;
6471         int ret;
6472
6473         ASSERT_RTNL();
6474
6475         /*
6476          *      Set the flags on our device.
6477          */
6478
6479         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6480                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6481                                IFF_AUTOMEDIA)) |
6482                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6483                                     IFF_ALLMULTI));
6484
6485         /*
6486          *      Load in the correct multicast list now the flags have changed.
6487          */
6488
6489         if ((old_flags ^ flags) & IFF_MULTICAST)
6490                 dev_change_rx_flags(dev, IFF_MULTICAST);
6491
6492         dev_set_rx_mode(dev);
6493
6494         /*
6495          *      Have we downed the interface. We handle IFF_UP ourselves
6496          *      according to user attempts to set it, rather than blindly
6497          *      setting it.
6498          */
6499
6500         ret = 0;
6501         if ((old_flags ^ flags) & IFF_UP)
6502                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6503
6504         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6505                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6506                 unsigned int old_flags = dev->flags;
6507
6508                 dev->gflags ^= IFF_PROMISC;
6509
6510                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6511                         if (dev->flags != old_flags)
6512                                 dev_set_rx_mode(dev);
6513         }
6514
6515         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6516            is important. Some (broken) drivers set IFF_PROMISC, when
6517            IFF_ALLMULTI is requested not asking us and not reporting.
6518          */
6519         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6520                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6521
6522                 dev->gflags ^= IFF_ALLMULTI;
6523                 __dev_set_allmulti(dev, inc, false);
6524         }
6525
6526         return ret;
6527 }
6528
6529 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6530                         unsigned int gchanges)
6531 {
6532         unsigned int changes = dev->flags ^ old_flags;
6533
6534         if (gchanges)
6535                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6536
6537         if (changes & IFF_UP) {
6538                 if (dev->flags & IFF_UP)
6539                         call_netdevice_notifiers(NETDEV_UP, dev);
6540                 else
6541                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6542         }
6543
6544         if (dev->flags & IFF_UP &&
6545             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6546                 struct netdev_notifier_change_info change_info;
6547
6548                 change_info.flags_changed = changes;
6549                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6550                                               &change_info.info);
6551         }
6552 }
6553
6554 /**
6555  *      dev_change_flags - change device settings
6556  *      @dev: device
6557  *      @flags: device state flags
6558  *
6559  *      Change settings on device based state flags. The flags are
6560  *      in the userspace exported format.
6561  */
6562 int dev_change_flags(struct net_device *dev, unsigned int flags)
6563 {
6564         int ret;
6565         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6566
6567         ret = __dev_change_flags(dev, flags);
6568         if (ret < 0)
6569                 return ret;
6570
6571         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6572         __dev_notify_flags(dev, old_flags, changes);
6573         return ret;
6574 }
6575 EXPORT_SYMBOL(dev_change_flags);
6576
6577 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6578 {
6579         const struct net_device_ops *ops = dev->netdev_ops;
6580
6581         if (ops->ndo_change_mtu)
6582                 return ops->ndo_change_mtu(dev, new_mtu);
6583
6584         dev->mtu = new_mtu;
6585         return 0;
6586 }
6587
6588 /**
6589  *      dev_set_mtu - Change maximum transfer unit
6590  *      @dev: device
6591  *      @new_mtu: new transfer unit
6592  *
6593  *      Change the maximum transfer size of the network device.
6594  */
6595 int dev_set_mtu(struct net_device *dev, int new_mtu)
6596 {
6597         int err, orig_mtu;
6598
6599         if (new_mtu == dev->mtu)
6600                 return 0;
6601
6602         /*      MTU must be positive.    */
6603         if (new_mtu < 0)
6604                 return -EINVAL;
6605
6606         if (!netif_device_present(dev))
6607                 return -ENODEV;
6608
6609         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6610         err = notifier_to_errno(err);
6611         if (err)
6612                 return err;
6613
6614         orig_mtu = dev->mtu;
6615         err = __dev_set_mtu(dev, new_mtu);
6616
6617         if (!err) {
6618                 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6619                                                    orig_mtu);
6620                 err = notifier_to_errno(err);
6621                 if (err) {
6622                         /* setting mtu back and notifying everyone again,
6623                          * so that they have a chance to revert changes.
6624                          */
6625                         __dev_set_mtu(dev, orig_mtu);
6626                         call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6627                                                      new_mtu);
6628                 }
6629         }
6630         return err;
6631 }
6632 EXPORT_SYMBOL(dev_set_mtu);
6633
6634 /**
6635  *      dev_set_group - Change group this device belongs to
6636  *      @dev: device
6637  *      @new_group: group this device should belong to
6638  */
6639 void dev_set_group(struct net_device *dev, int new_group)
6640 {
6641         dev->group = new_group;
6642 }
6643 EXPORT_SYMBOL(dev_set_group);
6644
6645 /**
6646  *      dev_set_mac_address - Change Media Access Control Address
6647  *      @dev: device
6648  *      @sa: new address
6649  *
6650  *      Change the hardware (MAC) address of the device
6651  */
6652 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6653 {
6654         const struct net_device_ops *ops = dev->netdev_ops;
6655         int err;
6656
6657         if (!ops->ndo_set_mac_address)
6658                 return -EOPNOTSUPP;
6659         if (sa->sa_family != dev->type)
6660                 return -EINVAL;
6661         if (!netif_device_present(dev))
6662                 return -ENODEV;
6663         err = ops->ndo_set_mac_address(dev, sa);
6664         if (err)
6665                 return err;
6666         dev->addr_assign_type = NET_ADDR_SET;
6667         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6668         add_device_randomness(dev->dev_addr, dev->addr_len);
6669         return 0;
6670 }
6671 EXPORT_SYMBOL(dev_set_mac_address);
6672
6673 /**
6674  *      dev_change_carrier - Change device carrier
6675  *      @dev: device
6676  *      @new_carrier: new value
6677  *
6678  *      Change device carrier
6679  */
6680 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6681 {
6682         const struct net_device_ops *ops = dev->netdev_ops;
6683
6684         if (!ops->ndo_change_carrier)
6685                 return -EOPNOTSUPP;
6686         if (!netif_device_present(dev))
6687                 return -ENODEV;
6688         return ops->ndo_change_carrier(dev, new_carrier);
6689 }
6690 EXPORT_SYMBOL(dev_change_carrier);
6691
6692 /**
6693  *      dev_get_phys_port_id - Get device physical port ID
6694  *      @dev: device
6695  *      @ppid: port ID
6696  *
6697  *      Get device physical port ID
6698  */
6699 int dev_get_phys_port_id(struct net_device *dev,
6700                          struct netdev_phys_item_id *ppid)
6701 {
6702         const struct net_device_ops *ops = dev->netdev_ops;
6703
6704         if (!ops->ndo_get_phys_port_id)
6705                 return -EOPNOTSUPP;
6706         return ops->ndo_get_phys_port_id(dev, ppid);
6707 }
6708 EXPORT_SYMBOL(dev_get_phys_port_id);
6709
6710 /**
6711  *      dev_get_phys_port_name - Get device physical port name
6712  *      @dev: device
6713  *      @name: port name
6714  *      @len: limit of bytes to copy to name
6715  *
6716  *      Get device physical port name
6717  */
6718 int dev_get_phys_port_name(struct net_device *dev,
6719                            char *name, size_t len)
6720 {
6721         const struct net_device_ops *ops = dev->netdev_ops;
6722
6723         if (!ops->ndo_get_phys_port_name)
6724                 return -EOPNOTSUPP;
6725         return ops->ndo_get_phys_port_name(dev, name, len);
6726 }
6727 EXPORT_SYMBOL(dev_get_phys_port_name);
6728
6729 /**
6730  *      dev_change_proto_down - update protocol port state information
6731  *      @dev: device
6732  *      @proto_down: new value
6733  *
6734  *      This info can be used by switch drivers to set the phys state of the
6735  *      port.
6736  */
6737 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6738 {
6739         const struct net_device_ops *ops = dev->netdev_ops;
6740
6741         if (!ops->ndo_change_proto_down)
6742                 return -EOPNOTSUPP;
6743         if (!netif_device_present(dev))
6744                 return -ENODEV;
6745         return ops->ndo_change_proto_down(dev, proto_down);
6746 }
6747 EXPORT_SYMBOL(dev_change_proto_down);
6748
6749 /**
6750  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
6751  *      @dev: device
6752  *      @fd: new program fd or negative value to clear
6753  *
6754  *      Set or clear a bpf program for a device
6755  */
6756 int dev_change_xdp_fd(struct net_device *dev, int fd)
6757 {
6758         const struct net_device_ops *ops = dev->netdev_ops;
6759         struct bpf_prog *prog = NULL;
6760         struct netdev_xdp xdp = {};
6761         int err;
6762
6763         if (!ops->ndo_xdp)
6764                 return -EOPNOTSUPP;
6765         if (fd >= 0) {
6766                 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6767                 if (IS_ERR(prog))
6768                         return PTR_ERR(prog);
6769         }
6770
6771         xdp.command = XDP_SETUP_PROG;
6772         xdp.prog = prog;
6773         err = ops->ndo_xdp(dev, &xdp);
6774         if (err < 0 && prog)
6775                 bpf_prog_put(prog);
6776
6777         return err;
6778 }
6779 EXPORT_SYMBOL(dev_change_xdp_fd);
6780
6781 /**
6782  *      dev_new_index   -       allocate an ifindex
6783  *      @net: the applicable net namespace
6784  *
6785  *      Returns a suitable unique value for a new device interface
6786  *      number.  The caller must hold the rtnl semaphore or the
6787  *      dev_base_lock to be sure it remains unique.
6788  */
6789 static int dev_new_index(struct net *net)
6790 {
6791         int ifindex = net->ifindex;
6792         for (;;) {
6793                 if (++ifindex <= 0)
6794                         ifindex = 1;
6795                 if (!__dev_get_by_index(net, ifindex))
6796                         return net->ifindex = ifindex;
6797         }
6798 }
6799
/* Delayed registration/unregistration */
/* List that net_set_todo() appends devices to via their todo_list member. */
static LIST_HEAD(net_todo_list);
/* Wait queue head; NOTE(review): its waiters and wakers are outside this
 * part of the file -- presumably tied to pending unregistrations, confirm.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6803
6804 static void net_set_todo(struct net_device *dev)
6805 {
6806         list_add_tail(&dev->todo_list, &net_todo_list);
6807         dev_net(dev)->dev_unreg_count++;
6808 }
6809
6810 static void rollback_registered_many(struct list_head *head)
6811 {
6812         struct net_device *dev, *tmp;
6813         LIST_HEAD(close_head);
6814
6815         BUG_ON(dev_boot_phase);
6816         ASSERT_RTNL();
6817
6818         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6819                 /* Some devices call without registering
6820                  * for initialization unwind. Remove those
6821                  * devices and proceed with the remaining.
6822                  */
6823                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6824                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6825                                  dev->name, dev);
6826
6827                         WARN_ON(1);
6828                         list_del(&dev->unreg_list);
6829                         continue;
6830                 }
6831                 dev->dismantle = true;
6832                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6833         }
6834
6835         /* If device is running, close it first. */
6836         list_for_each_entry(dev, head, unreg_list)
6837                 list_add_tail(&dev->close_list, &close_head);
6838         dev_close_many(&close_head, true);
6839
6840         list_for_each_entry(dev, head, unreg_list) {
6841                 /* And unlink it from device chain. */
6842                 unlist_netdevice(dev);
6843
6844                 dev->reg_state = NETREG_UNREGISTERING;
6845         }
6846         flush_all_backlogs();
6847
6848         synchronize_net();
6849
6850         list_for_each_entry(dev, head, unreg_list) {
6851                 struct sk_buff *skb = NULL;
6852
6853                 /* Shutdown queueing discipline. */
6854                 dev_shutdown(dev);
6855
6856
6857                 /* Notify protocols, that we are about to destroy
6858                    this device. They should clean all the things.
6859                 */
6860                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6861
6862                 if (!dev->rtnl_link_ops ||
6863                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6864                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6865                                                      GFP_KERNEL);
6866
6867                 /*
6868                  *      Flush the unicast and multicast chains
6869                  */
6870                 dev_uc_flush(dev);
6871                 dev_mc_flush(dev);
6872
6873                 if (dev->netdev_ops->ndo_uninit)
6874                         dev->netdev_ops->ndo_uninit(dev);
6875
6876                 if (skb)
6877                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6878
6879                 /* Notifier chain MUST detach us all upper devices. */
6880                 WARN_ON(netdev_has_any_upper_dev(dev));
6881
6882                 /* Remove entries from kobject tree */
6883                 netdev_unregister_kobject(dev);
6884 #ifdef CONFIG_XPS
6885                 /* Remove XPS queueing entries */
6886                 netif_reset_xps_queues_gt(dev, 0);
6887 #endif
6888         }
6889
6890         synchronize_net();
6891
6892         list_for_each_entry(dev, head, unreg_list)
6893                 dev_put(dev);
6894 }
6895
6896 static void rollback_registered(struct net_device *dev)
6897 {
6898         LIST_HEAD(single);
6899
6900         list_add(&dev->unreg_list, &single);
6901         rollback_registered_many(&single);
6902         list_del(&single);
6903 }
6904
6905 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6906         struct net_device *upper, netdev_features_t features)
6907 {
6908         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6909         netdev_features_t feature;
6910         int feature_bit;
6911
6912         for_each_netdev_feature(&upper_disables, feature_bit) {
6913                 feature = __NETIF_F_BIT(feature_bit);
6914                 if (!(upper->wanted_features & feature)
6915                     && (features & feature)) {
6916                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6917                                    &feature, upper->name);
6918                         features &= ~feature;
6919                 }
6920         }
6921
6922         return features;
6923 }
6924
6925 static void netdev_sync_lower_features(struct net_device *upper,
6926         struct net_device *lower, netdev_features_t features)
6927 {
6928         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6929         netdev_features_t feature;
6930         int feature_bit;
6931
6932         for_each_netdev_feature(&upper_disables, feature_bit) {
6933                 feature = __NETIF_F_BIT(feature_bit);
6934                 if (!(features & feature) && (lower->features & feature)) {
6935                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6936                                    &feature, lower->name);
6937                         lower->wanted_features &= ~feature;
6938                         netdev_update_features(lower);
6939
6940                         if (unlikely(lower->features & feature))
6941                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6942                                             &feature, lower->name);
6943                 }
6944         }
6945 }
6946
6947 static netdev_features_t netdev_fix_features(struct net_device *dev,
6948         netdev_features_t features)
6949 {
6950         /* Fix illegal checksum combinations */
6951         if ((features & NETIF_F_HW_CSUM) &&
6952             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6953                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6954                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6955         }
6956
6957         /* TSO requires that SG is present as well. */
6958         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6959                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6960                 features &= ~NETIF_F_ALL_TSO;
6961         }
6962
6963         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6964                                         !(features & NETIF_F_IP_CSUM)) {
6965                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6966                 features &= ~NETIF_F_TSO;
6967                 features &= ~NETIF_F_TSO_ECN;
6968         }
6969
6970         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6971                                          !(features & NETIF_F_IPV6_CSUM)) {
6972                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6973                 features &= ~NETIF_F_TSO6;
6974         }
6975
6976         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6977         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6978                 features &= ~NETIF_F_TSO_MANGLEID;
6979
6980         /* TSO ECN requires that TSO is present as well. */
6981         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6982                 features &= ~NETIF_F_TSO_ECN;
6983
6984         /* Software GSO depends on SG. */
6985         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6986                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6987                 features &= ~NETIF_F_GSO;
6988         }
6989
6990         /* UFO needs SG and checksumming */
6991         if (features & NETIF_F_UFO) {
6992                 /* maybe split UFO into V4 and V6? */
6993                 if (!(features & NETIF_F_HW_CSUM) &&
6994                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6995                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6996                         netdev_dbg(dev,
6997                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6998                         features &= ~NETIF_F_UFO;
6999                 }
7000
7001                 if (!(features & NETIF_F_SG)) {
7002                         netdev_dbg(dev,
7003                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7004                         features &= ~NETIF_F_UFO;
7005                 }
7006         }
7007
7008         /* GSO partial features require GSO partial be set */
7009         if ((features & dev->gso_partial_features) &&
7010             !(features & NETIF_F_GSO_PARTIAL)) {
7011                 netdev_dbg(dev,
7012                            "Dropping partially supported GSO features since no GSO partial.\n");
7013                 features &= ~dev->gso_partial_features;
7014         }
7015
7016 #ifdef CONFIG_NET_RX_BUSY_POLL
7017         if (dev->netdev_ops->ndo_busy_poll)
7018                 features |= NETIF_F_BUSY_POLL;
7019         else
7020 #endif
7021                 features &= ~NETIF_F_BUSY_POLL;
7022
7023         return features;
7024 }
7025
7026 int __netdev_update_features(struct net_device *dev)
7027 {
7028         struct net_device *upper, *lower;
7029         netdev_features_t features;
7030         struct list_head *iter;
7031         int err = -1;
7032
7033         ASSERT_RTNL();
7034
7035         features = netdev_get_wanted_features(dev);
7036
7037         if (dev->netdev_ops->ndo_fix_features)
7038                 features = dev->netdev_ops->ndo_fix_features(dev, features);
7039
7040         /* driver might be less strict about feature dependencies */
7041         features = netdev_fix_features(dev, features);
7042
7043         /* some features can't be enabled if they're off an an upper device */
7044         netdev_for_each_upper_dev_rcu(dev, upper, iter)
7045                 features = netdev_sync_upper_features(dev, upper, features);
7046
7047         if (dev->features == features)
7048                 goto sync_lower;
7049
7050         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7051                 &dev->features, &features);
7052
7053         if (dev->netdev_ops->ndo_set_features)
7054                 err = dev->netdev_ops->ndo_set_features(dev, features);
7055         else
7056                 err = 0;
7057
7058         if (unlikely(err < 0)) {
7059                 netdev_err(dev,
7060                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
7061                         err, &features, &dev->features);
7062                 /* return non-0 since some features might have changed and
7063                  * it's better to fire a spurious notification than miss it
7064                  */
7065                 return -1;
7066         }
7067
7068 sync_lower:
7069         /* some features must be disabled on lower devices when disabled
7070          * on an upper device (think: bonding master or bridge)
7071          */
7072         netdev_for_each_lower_dev(dev, lower, iter)
7073                 netdev_sync_lower_features(dev, lower, features);
7074
7075         if (!err)
7076                 dev->features = features;
7077
7078         return err < 0 ? 0 : 1;
7079 }
7080
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate dev->features through __netdev_update_features() and
 *      send a features-change notification whenever that call returns
 *      non-zero. Intended for use after driver or hardware dependent
 *      conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        int changed = __netdev_update_features(dev);

        if (!changed)
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7095
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate dev->features and send the features-change
 *      notification unconditionally, whatever the outcome of the
 *      recalculation. Use this instead of netdev_update_features()
 *      when dev->vlan_features might have changed as well, so that the
 *      change can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: notify in every case. */
        (void)__netdev_update_features(dev);
        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7112
7113 /**
7114  *      netif_stacked_transfer_operstate -      transfer operstate
7115  *      @rootdev: the root or lower level device to transfer state from
7116  *      @dev: the device to transfer operstate to
7117  *
7118  *      Transfer operational state from root to device. This is normally
7119  *      called when a stacking relationship exists between the root
7120  *      device and the device(a leaf device).
7121  */
7122 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7123                                         struct net_device *dev)
7124 {
7125         if (rootdev->operstate == IF_OPER_DORMANT)
7126                 netif_dormant_on(dev);
7127         else
7128                 netif_dormant_off(dev);
7129
7130         if (netif_carrier_ok(rootdev)) {
7131                 if (!netif_carrier_ok(dev))
7132                         netif_carrier_on(dev);
7133         } else {
7134                 if (netif_carrier_ok(dev))
7135                         netif_carrier_off(dev);
7136         }
7137 }
7138 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7139
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * kzalloc() is tried first with vzalloc() as the fallback.
 * Returns 0 on success or -ENOMEM if both allocations fail.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        size_t bytes = nqueues * sizeof(*queues);
        unsigned int q;

        BUG_ON(nqueues < 1);

        queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
        if (!queues)
                queues = vzalloc(bytes);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (q = 0; q < nqueues; q++)
                queues[q].dev = dev;

        return 0;
}
#endif
7162
7163 static void netdev_init_one_queue(struct net_device *dev,
7164                                   struct netdev_queue *queue, void *_unused)
7165 {
7166         /* Initialize queue lock */
7167         spin_lock_init(&queue->_xmit_lock);
7168         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7169         queue->xmit_lock_owner = -1;
7170         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7171         queue->dev = dev;
7172 #ifdef CONFIG_BQL
7173         dql_init(&queue->dql, HZ);
7174 #endif
7175 }
7176
7177 static void netif_free_tx_queues(struct net_device *dev)
7178 {
7179         kvfree(dev->_tx);
7180 }
7181
7182 static int netif_alloc_netdev_queues(struct net_device *dev)
7183 {
7184         unsigned int count = dev->num_tx_queues;
7185         struct netdev_queue *tx;
7186         size_t sz = count * sizeof(*tx);
7187
7188         if (count < 1 || count > 0xffff)
7189                 return -EINVAL;
7190
7191         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7192         if (!tx) {
7193                 tx = vzalloc(sz);
7194                 if (!tx)
7195                         return -ENOMEM;
7196         }
7197         dev->_tx = tx;
7198
7199         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7200         spin_lock_init(&dev->tx_global_lock);
7201
7202         return 0;
7203 }
7204
7205 void netif_tx_stop_all_queues(struct net_device *dev)
7206 {
7207         unsigned int i;
7208
7209         for (i = 0; i < dev->num_tx_queues; i++) {
7210                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7211                 netif_tx_stop_queue(txq);
7212         }
7213 }
7214 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7215
7216 /**
7217  *      register_netdevice      - register a network device
7218  *      @dev: device to register
7219  *
7220  *      Take a completed network device structure and add it to the kernel
7221  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7222  *      chain. 0 is returned on success. A negative errno code is returned
7223  *      on a failure to set up the device, or if the name is a duplicate.
7224  *
7225  *      Callers must hold the rtnl semaphore. You may want
7226  *      register_netdev() instead of this.
7227  *
7228  *      BUGS:
7229  *      The locking appears insufficient to guarantee two parallel registers
7230  *      will not get the same name.
7231  */
7232
7233 int register_netdevice(struct net_device *dev)
7234 {
7235         int ret;
7236         struct net *net = dev_net(dev);
7237
7238         BUG_ON(dev_boot_phase);
7239         ASSERT_RTNL();
7240
7241         might_sleep();
7242
7243         /* When net_device's are persistent, this will be fatal. */
7244         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7245         BUG_ON(!net);
7246
7247         spin_lock_init(&dev->addr_list_lock);
7248         netdev_set_addr_lockdep_class(dev);
7249
7250         ret = dev_get_valid_name(net, dev, dev->name);
7251         if (ret < 0)
7252                 goto out;
7253
7254         /* Init, if this function is available */
7255         if (dev->netdev_ops->ndo_init) {
7256                 ret = dev->netdev_ops->ndo_init(dev);
7257                 if (ret) {
7258                         if (ret > 0)
7259                                 ret = -EIO;
7260                         goto out;
7261                 }
7262         }
7263
7264         if (((dev->hw_features | dev->features) &
7265              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7266             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7267              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7268                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7269                 ret = -EINVAL;
7270                 goto err_uninit;
7271         }
7272
7273         ret = -EBUSY;
7274         if (!dev->ifindex)
7275                 dev->ifindex = dev_new_index(net);
7276         else if (__dev_get_by_index(net, dev->ifindex))
7277                 goto err_uninit;
7278
7279         /* Transfer changeable features to wanted_features and enable
7280          * software offloads (GSO and GRO).
7281          */
7282         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7283         dev->features |= NETIF_F_SOFT_FEATURES;
7284         dev->wanted_features = dev->features & dev->hw_features;
7285
7286         if (!(dev->flags & IFF_LOOPBACK))
7287                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7288
7289         /* If IPv4 TCP segmentation offload is supported we should also
7290          * allow the device to enable segmenting the frame with the option
7291          * of ignoring a static IP ID value.  This doesn't enable the
7292          * feature itself but allows the user to enable it later.
7293          */
7294         if (dev->hw_features & NETIF_F_TSO)
7295                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7296         if (dev->vlan_features & NETIF_F_TSO)
7297                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7298         if (dev->mpls_features & NETIF_F_TSO)
7299                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7300         if (dev->hw_enc_features & NETIF_F_TSO)
7301                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7302
7303         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7304          */
7305         dev->vlan_features |= NETIF_F_HIGHDMA;
7306
7307         /* Make NETIF_F_SG inheritable to tunnel devices.
7308          */
7309         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7310
7311         /* Make NETIF_F_SG inheritable to MPLS.
7312          */
7313         dev->mpls_features |= NETIF_F_SG;
7314
7315         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7316         ret = notifier_to_errno(ret);
7317         if (ret)
7318                 goto err_uninit;
7319
7320         ret = netdev_register_kobject(dev);
7321         if (ret)
7322                 goto err_uninit;
7323         dev->reg_state = NETREG_REGISTERED;
7324
7325         __netdev_update_features(dev);
7326
7327         /*
7328          *      Default initial state at registry is that the
7329          *      device is present.
7330          */
7331
7332         set_bit(__LINK_STATE_PRESENT, &dev->state);
7333
7334         linkwatch_init_dev(dev);
7335
7336         dev_init_scheduler(dev);
7337         dev_hold(dev);
7338         list_netdevice(dev);
7339         add_device_randomness(dev->dev_addr, dev->addr_len);
7340
7341         /* If the device has permanent device address, driver should
7342          * set dev_addr and also addr_assign_type should be set to
7343          * NET_ADDR_PERM (default value).
7344          */
7345         if (dev->addr_assign_type == NET_ADDR_PERM)
7346                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7347
7348         /* Notify protocols, that a new device appeared. */
7349         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7350         ret = notifier_to_errno(ret);
7351         if (ret) {
7352                 rollback_registered(dev);
7353                 dev->reg_state = NETREG_UNREGISTERED;
7354         }
7355         /*
7356          *      Prevent userspace races by waiting until the network
7357          *      device is fully setup before sending notifications.
7358          */
7359         if (!dev->rtnl_link_ops ||
7360             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7361                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7362
7363 out:
7364         return ret;
7365
7366 err_uninit:
7367         if (dev->netdev_ops->ndo_uninit)
7368                 dev->netdev_ops->ndo_uninit(dev);
7369         goto out;
7370 }
7371 EXPORT_SYMBOL(register_netdevice);
7372
7373 /**
7374  *      init_dummy_netdev       - init a dummy network device for NAPI
7375  *      @dev: device to init
7376  *
7377  *      This takes a network device structure and initialize the minimum
7378  *      amount of fields so it can be used to schedule NAPI polls without
7379  *      registering a full blown interface. This is to be used by drivers
7380  *      that need to tie several hardware interfaces to a single NAPI
7381  *      poll scheduler due to HW limitations.
7382  */
7383 int init_dummy_netdev(struct net_device *dev)
7384 {
7385         /* Clear everything. Note we don't initialize spinlocks
7386          * are they aren't supposed to be taken by any of the
7387          * NAPI code and this dummy netdev is supposed to be
7388          * only ever used for NAPI polls
7389          */
7390         memset(dev, 0, sizeof(struct net_device));
7391
7392         /* make sure we BUG if trying to hit standard
7393          * register/unregister code path
7394          */
7395         dev->reg_state = NETREG_DUMMY;
7396
7397         /* NAPI wants this */
7398         INIT_LIST_HEAD(&dev->napi_list);
7399
7400         /* a dummy interface is started by default */
7401         set_bit(__LINK_STATE_PRESENT, &dev->state);
7402         set_bit(__LINK_STATE_START, &dev->state);
7403
7404         /* Note : We dont allocate pcpu_refcnt for dummy devices,
7405          * because users of this 'device' dont need to change
7406          * its refcount.
7407          */
7408
7409         return 0;
7410 }
7411 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7412
7413
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *      the device name is expanded if a format string was passed to
 *      alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
        int status;

        rtnl_lock();
        status = register_netdevice(dev);
        rtnl_unlock();

        return status;
}
EXPORT_SYMBOL(register_netdev);
7437
7438 int netdev_refcnt_read(const struct net_device *dev)
7439 {
7440         int i, refcnt = 0;
7441
7442         for_each_possible_cpu(i)
7443                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7444         return refcnt;
7445 }
7446 EXPORT_SYMBOL(netdev_refcnt_read);
7447
7448 /**
7449  * netdev_wait_allrefs - wait until all references are gone.
7450  * @dev: target net_device
7451  *
7452  * This is called when unregistering network devices.
7453  *
7454  * Any protocol or device that holds a reference should register
7455  * for netdevice notification, and cleanup and put back the
7456  * reference if they receive an UNREGISTER event.
7457  * We can get stuck here if buggy protocols don't correctly
7458  * call dev_put.
7459  */
7460 static void netdev_wait_allrefs(struct net_device *dev)
7461 {
7462         unsigned long rebroadcast_time, warning_time;
7463         int refcnt;
7464
7465         linkwatch_forget_dev(dev);
7466
7467         rebroadcast_time = warning_time = jiffies;
7468         refcnt = netdev_refcnt_read(dev);
7469
7470         while (refcnt != 0) {
7471                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7472                         rtnl_lock();
7473
7474                         /* Rebroadcast unregister notification */
7475                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7476
7477                         __rtnl_unlock();
7478                         rcu_barrier();
7479                         rtnl_lock();
7480
7481                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7482                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7483                                      &dev->state)) {
7484                                 /* We must not have linkwatch events
7485                                  * pending on unregister. If this
7486                                  * happens, we simply run the queue
7487                                  * unscheduled, resulting in a noop
7488                                  * for this device.
7489                                  */
7490                                 linkwatch_run_queue();
7491                         }
7492
7493                         __rtnl_unlock();
7494
7495                         rebroadcast_time = jiffies;
7496                 }
7497
7498                 msleep(250);
7499
7500                 refcnt = netdev_refcnt_read(dev);
7501
7502                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7503                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7504                                  dev->name, refcnt);
7505                         warning_time = jiffies;
7506                 }
7507         }
7508 }
7509
7510 /* The sequence is:
7511  *
7512  *      rtnl_lock();
7513  *      ...
7514  *      register_netdevice(x1);
7515  *      register_netdevice(x2);
7516  *      ...
7517  *      unregister_netdevice(y1);
7518  *      unregister_netdevice(y2);
7519  *      ...
7520  *      rtnl_unlock();
7521  *      free_netdev(y1);
7522  *      free_netdev(y2);
7523  *
7524  * We are invoked by rtnl_unlock().
7525  * This allows us to deal with problems:
7526  * 1) We can delete sysfs objects which invoke hotplug
7527  *    without deadlocking with linkwatch via keventd.
7528  * 2) Since we run with the RTNL semaphore not held, we can sleep
7529  *    safely in order to wait for the netdev refcnt to drop to zero.
7530  *
7531  * We must not return until all unregister events added during
7532  * the interval the lock was held have been completed.
7533  */
7534 void netdev_run_todo(void)
7535 {
7536         struct list_head list;
7537
7538         /* Snapshot list, allow later requests */
7539         list_replace_init(&net_todo_list, &list);
7540
7541         __rtnl_unlock();
7542
7543
7544         /* Wait for rcu callbacks to finish before next phase */
7545         if (!list_empty(&list))
7546                 rcu_barrier();
7547
7548         while (!list_empty(&list)) {
7549                 struct net_device *dev
7550                         = list_first_entry(&list, struct net_device, todo_list);
7551                 list_del(&dev->todo_list);
7552
7553                 rtnl_lock();
7554                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7555                 __rtnl_unlock();
7556
7557                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7558                         pr_err("network todo '%s' but state %d\n",
7559                                dev->name, dev->reg_state);
7560                         dump_stack();
7561                         continue;
7562                 }
7563
7564                 dev->reg_state = NETREG_UNREGISTERED;
7565
7566                 netdev_wait_allrefs(dev);
7567
7568                 /* paranoia */
7569                 BUG_ON(netdev_refcnt_read(dev));
7570                 BUG_ON(!list_empty(&dev->ptype_all));
7571                 BUG_ON(!list_empty(&dev->ptype_specific));
7572                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7573                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7574                 WARN_ON(dev->dn_ptr);
7575
7576                 if (dev->destructor)
7577                         dev->destructor(dev);
7578
7579                 /* Report a network device has been unregistered */
7580                 rtnl_lock();
7581                 dev_net(dev)->dev_unreg_count--;
7582                 __rtnl_unlock();
7583                 wake_up(&netdev_unregistering_wq);
7584
7585                 /* Free network device */
7586                 kobject_put(&dev->dev.kobj);
7587         }
7588 }
7589
7590 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7591  * all the same fields in the same order as net_device_stats, with only
7592  * the type differing, but rtnl_link_stats64 may have additional fields
7593  * at the end for newer counters.
7594  */
7595 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7596                              const struct net_device_stats *netdev_stats)
7597 {
7598 #if BITS_PER_LONG == 64
7599         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7600         memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
7601         /* zero out counters that only exist in rtnl_link_stats64 */
7602         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7603                sizeof(*stats64) - sizeof(*netdev_stats));
7604 #else
7605         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7606         const unsigned long *src = (const unsigned long *)netdev_stats;
7607         u64 *dst = (u64 *)stats64;
7608
7609         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7610         for (i = 0; i < n; i++)
7611                 dst[i] = src[i];
7612         /* zero out counters that only exist in rtnl_link_stats64 */
7613         memset((char *)stats64 + n * sizeof(u64), 0,
7614                sizeof(*stats64) - n * sizeof(u64));
7615 #endif
7616 }
7617 EXPORT_SYMBOL(netdev_stats_to_stats64);
7618
7619 /**
7620  *      dev_get_stats   - get network device statistics
7621  *      @dev: device to get statistics from
7622  *      @storage: place to store stats
7623  *
7624  *      Get network statistics from device. Return @storage.
7625  *      The device driver may provide its own method by setting
7626  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7627  *      otherwise the internal statistics structure is used.
7628  */
7629 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7630                                         struct rtnl_link_stats64 *storage)
7631 {
7632         const struct net_device_ops *ops = dev->netdev_ops;
7633
7634         if (ops->ndo_get_stats64) {
7635                 memset(storage, 0, sizeof(*storage));
7636                 ops->ndo_get_stats64(dev, storage);
7637         } else if (ops->ndo_get_stats) {
7638                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7639         } else {
7640                 netdev_stats_to_stats64(storage, &dev->stats);
7641         }
7642         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
7643         storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
7644         storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
7645         return storage;
7646 }
7647 EXPORT_SYMBOL(dev_get_stats);
7648
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialized with noop_qdisc and published through
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the existing pointer is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
                        ingress->qdisc_sleeping = &noop_qdisc;
                        /* Publish only once the queue is fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}
7666
7667 static const struct ethtool_ops default_ethtool_ops;
7668
7669 void netdev_set_default_ethtool_ops(struct net_device *dev,
7670                                     const struct ethtool_ops *ops)
7671 {
7672         if (dev->ethtool_ops == &default_ethtool_ops)
7673                 dev->ethtool_ops = ops;
7674 }
7675 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7676
7677 void netdev_freemem(struct net_device *dev)
7678 {
7679         char *addr = (char *)dev - dev->padded;
7680
7681         kvfree(addr);
7682 }
7683
7684 /**
7685  *      alloc_netdev_mqs - allocate network device
7686  *      @sizeof_priv:           size of private data to allocate space for
7687  *      @name:                  device name format string
7688  *      @name_assign_type:      origin of device name
7689  *      @setup:                 callback to initialize device
7690  *      @txqs:                  the number of TX subqueues to allocate
7691  *      @rxqs:                  the number of RX subqueues to allocate
7692  *
7693  *      Allocates a struct net_device with private data area for driver use
7694  *      and performs basic initialization.  Also allocates subqueue structs
7695  *      for each queue on the device.
7696  */
7697 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7698                 unsigned char name_assign_type,
7699                 void (*setup)(struct net_device *),
7700                 unsigned int txqs, unsigned int rxqs)
7701 {
7702         struct net_device *dev;
7703         size_t alloc_size;
7704         struct net_device *p;
7705
7706         BUG_ON(strlen(name) >= sizeof(dev->name));
7707
7708         if (txqs < 1) {
7709                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7710                 return NULL;
7711         }
7712
7713 #ifdef CONFIG_SYSFS
7714         if (rxqs < 1) {
7715                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7716                 return NULL;
7717         }
7718 #endif
7719
7720         alloc_size = sizeof(struct net_device);
7721         if (sizeof_priv) {
7722                 /* ensure 32-byte alignment of private area */
7723                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7724                 alloc_size += sizeof_priv;
7725         }
7726         /* ensure 32-byte alignment of whole construct */
7727         alloc_size += NETDEV_ALIGN - 1;
7728
7729         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7730         if (!p)
7731                 p = vzalloc(alloc_size);
7732         if (!p)
7733                 return NULL;
7734
7735         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7736         dev->padded = (char *)dev - (char *)p;
7737
7738         dev->pcpu_refcnt = alloc_percpu(int);
7739         if (!dev->pcpu_refcnt)
7740                 goto free_dev;
7741
7742         if (dev_addr_init(dev))
7743                 goto free_pcpu;
7744
7745         dev_mc_init(dev);
7746         dev_uc_init(dev);
7747
7748         dev_net_set(dev, &init_net);
7749
7750         dev->gso_max_size = GSO_MAX_SIZE;
7751         dev->gso_max_segs = GSO_MAX_SEGS;
7752
7753         INIT_LIST_HEAD(&dev->napi_list);
7754         INIT_LIST_HEAD(&dev->unreg_list);
7755         INIT_LIST_HEAD(&dev->close_list);
7756         INIT_LIST_HEAD(&dev->link_watch_list);
7757         INIT_LIST_HEAD(&dev->adj_list.upper);
7758         INIT_LIST_HEAD(&dev->adj_list.lower);
7759         INIT_LIST_HEAD(&dev->all_adj_list.upper);
7760         INIT_LIST_HEAD(&dev->all_adj_list.lower);
7761         INIT_LIST_HEAD(&dev->ptype_all);
7762         INIT_LIST_HEAD(&dev->ptype_specific);
7763 #ifdef CONFIG_NET_SCHED
7764         hash_init(dev->qdisc_hash);
7765 #endif
7766         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7767         setup(dev);
7768
7769         if (!dev->tx_queue_len) {
7770                 dev->priv_flags |= IFF_NO_QUEUE;
7771                 dev->tx_queue_len = 1;
7772         }
7773
7774         dev->num_tx_queues = txqs;
7775         dev->real_num_tx_queues = txqs;
7776         if (netif_alloc_netdev_queues(dev))
7777                 goto free_all;
7778
7779 #ifdef CONFIG_SYSFS
7780         dev->num_rx_queues = rxqs;
7781         dev->real_num_rx_queues = rxqs;
7782         if (netif_alloc_rx_queues(dev))
7783                 goto free_all;
7784 #endif
7785
7786         strcpy(dev->name, name);
7787         dev->name_assign_type = name_assign_type;
7788         dev->group = INIT_NETDEV_GROUP;
7789         if (!dev->ethtool_ops)
7790                 dev->ethtool_ops = &default_ethtool_ops;
7791
7792         nf_hook_ingress_init(dev);
7793
7794         return dev;
7795
7796 free_all:
7797         free_netdev(dev);
7798         return NULL;
7799
7800 free_pcpu:
7801         free_percpu(dev->pcpu_refcnt);
7802 free_dev:
7803         netdev_freemem(dev);
7804         return NULL;
7805 }
7806 EXPORT_SYMBOL(alloc_netdev_mqs);
7807
7808 /**
7809  *      free_netdev - free network device
7810  *      @dev: device
7811  *
7812  *      This function does the last stage of destroying an allocated device
7813  *      interface. The reference to the device object is released.
7814  *      If this is the last reference then it will be freed.
7815  *      Must be called in process context.
7816  */
7817 void free_netdev(struct net_device *dev)
7818 {
7819         struct napi_struct *p, *n;
7820
7821         might_sleep();
7822         netif_free_tx_queues(dev);
7823 #ifdef CONFIG_SYSFS
7824         kvfree(dev->_rx);
7825 #endif
7826
7827         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7828
7829         /* Flush device addresses */
7830         dev_addr_flush(dev);
7831
7832         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7833                 netif_napi_del(p);
7834
7835         free_percpu(dev->pcpu_refcnt);
7836         dev->pcpu_refcnt = NULL;
7837
7838         /*  Compatibility with error handling in drivers */
7839         if (dev->reg_state == NETREG_UNINITIALIZED) {
7840                 netdev_freemem(dev);
7841                 return;
7842         }
7843
7844         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7845         dev->reg_state = NETREG_RELEASED;
7846
7847         /* will free via device release */
7848         put_device(&dev->dev);
7849 }
7850 EXPORT_SYMBOL(free_netdev);
7851
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	The expedited RCU grace period is used while the rtnl lock is
 *	held, the normal one otherwise.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7867
7868 /**
7869  *      unregister_netdevice_queue - remove device from the kernel
7870  *      @dev: device
7871  *      @head: list
7872  *
7873  *      This function shuts down a device interface and removes it
7874  *      from the kernel tables.
7875  *      If head not NULL, device is queued to be unregistered later.
7876  *
7877  *      Callers must hold the rtnl semaphore.  You may want
7878  *      unregister_netdev() instead of this.
7879  */
7880
7881 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7882 {
7883         ASSERT_RTNL();
7884
7885         if (head) {
7886                 list_move_tail(&dev->unreg_list, head);
7887         } else {
7888                 rollback_registered(dev);
7889                 /* Finish processing unregister after unlock */
7890                 net_set_todo(dev);
7891         }
7892 }
7893 EXPORT_SYMBOL(unregister_netdevice_queue);
7894
7895 /**
7896  *      unregister_netdevice_many - unregister many devices
7897  *      @head: list of devices
7898  *
7899  *  Note: As most callers use a stack allocated list_head,
7900  *  we force a list_del() to make sure stack wont be corrupted later.
7901  */
7902 void unregister_netdevice_many(struct list_head *head)
7903 {
7904         struct net_device *dev;
7905
7906         if (!list_empty(head)) {
7907                 rollback_registered_many(head);
7908                 list_for_each_entry(dev, head, unreg_list)
7909                         net_set_todo(dev);
7910                 list_del(head);
7911         }
7912 }
7913 EXPORT_SYMBOL(unregister_netdevice_many);
7914
7915 /**
7916  *      unregister_netdev - remove device from the kernel
7917  *      @dev: device
7918  *
7919  *      This function shuts down a device interface and removes it
7920  *      from the kernel tables.
7921  *
7922  *      This is just a wrapper for unregister_netdevice that takes
7923  *      the rtnl semaphore.  In general you want to use this and not
7924  *      unregister_netdevice.
7925  */
7926 void unregister_netdev(struct net_device *dev)
7927 {
7928         rtnl_lock();
7929         unregister_netdevice(dev);
7930         rtnl_unlock();
7931 }
7932 EXPORT_SYMBOL(unregister_netdev);
7933
7934 /**
7935  *      dev_change_net_namespace - move device to different nethost namespace
7936  *      @dev: device
7937  *      @net: network namespace
7938  *      @pat: If not NULL name pattern to try if the current device name
7939  *            is already taken in the destination network namespace.
7940  *
7941  *      This function shuts down a device interface and moves it
7942  *      to a new network namespace. On success 0 is returned, on
7943  *      a failure a netagive errno code is returned.
7944  *
7945  *      Callers must hold the rtnl semaphore.
7946  */
7947
7948 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7949 {
7950         int err;
7951
7952         ASSERT_RTNL();
7953
7954         /* Don't allow namespace local devices to be moved. */
7955         err = -EINVAL;
7956         if (dev->features & NETIF_F_NETNS_LOCAL)
7957                 goto out;
7958
7959         /* Ensure the device has been registrered */
7960         if (dev->reg_state != NETREG_REGISTERED)
7961                 goto out;
7962
7963         /* Get out if there is nothing todo */
7964         err = 0;
7965         if (net_eq(dev_net(dev), net))
7966                 goto out;
7967
7968         /* Pick the destination device name, and ensure
7969          * we can use it in the destination network namespace.
7970          */
7971         err = -EEXIST;
7972         if (__dev_get_by_name(net, dev->name)) {
7973                 /* We get here if we can't use the current device name */
7974                 if (!pat)
7975                         goto out;
7976                 err = dev_get_valid_name(net, dev, pat);
7977                 if (err < 0)
7978                         goto out;
7979         }
7980
7981         /*
7982          * And now a mini version of register_netdevice unregister_netdevice.
7983          */
7984
7985         /* If device is running close it first. */
7986         dev_close(dev);
7987
7988         /* And unlink it from device chain */
7989         unlist_netdevice(dev);
7990
7991         synchronize_net();
7992
7993         /* Shutdown queueing discipline. */
7994         dev_shutdown(dev);
7995
7996         /* Notify protocols, that we are about to destroy
7997            this device. They should clean all the things.
7998
7999            Note that dev->reg_state stays at NETREG_REGISTERED.
8000            This is wanted because this way 8021q and macvlan know
8001            the device is just moving and can keep their slaves up.
8002         */
8003         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8004         rcu_barrier();
8005         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8006         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
8007
8008         /*
8009          *      Flush the unicast and multicast chains
8010          */
8011         dev_uc_flush(dev);
8012         dev_mc_flush(dev);
8013
8014         /* Send a netdev-removed uevent to the old namespace */
8015         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8016         netdev_adjacent_del_links(dev);
8017
8018         /* Actually switch the network namespace */
8019         dev_net_set(dev, net);
8020
8021         /* If there is an ifindex conflict assign a new one */
8022         if (__dev_get_by_index(net, dev->ifindex))
8023                 dev->ifindex = dev_new_index(net);
8024
8025         /* Send a netdev-add uevent to the new namespace */
8026         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8027         netdev_adjacent_add_links(dev);
8028
8029         /* Fixup kobjects */
8030         err = device_rename(&dev->dev, dev->name);
8031         WARN_ON(err);
8032
8033         /* Add the device back in the hashes */
8034         list_netdevice(dev);
8035
8036         /* Notify protocols, that a new device appeared. */
8037         call_netdevice_notifiers(NETDEV_REGISTER, dev);
8038
8039         /*
8040          *      Prevent userspace races by waiting until the network
8041          *      device is fully setup before sending notifications.
8042          */
8043         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8044
8045         synchronize_net();
8046         err = 0;
8047 out:
8048         return err;
8049 }
8050 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8051
8052 static int dev_cpu_callback(struct notifier_block *nfb,
8053                             unsigned long action,
8054                             void *ocpu)
8055 {
8056         struct sk_buff **list_skb;
8057         struct sk_buff *skb;
8058         unsigned int cpu, oldcpu = (unsigned long)ocpu;
8059         struct softnet_data *sd, *oldsd;
8060
8061         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
8062                 return NOTIFY_OK;
8063
8064         local_irq_disable();
8065         cpu = smp_processor_id();
8066         sd = &per_cpu(softnet_data, cpu);
8067         oldsd = &per_cpu(softnet_data, oldcpu);
8068
8069         /* Find end of our completion_queue. */
8070         list_skb = &sd->completion_queue;
8071         while (*list_skb)
8072                 list_skb = &(*list_skb)->next;
8073         /* Append completion queue from offline CPU. */
8074         *list_skb = oldsd->completion_queue;
8075         oldsd->completion_queue = NULL;
8076
8077         /* Append output queue from offline CPU. */
8078         if (oldsd->output_queue) {
8079                 *sd->output_queue_tailp = oldsd->output_queue;
8080                 sd->output_queue_tailp = oldsd->output_queue_tailp;
8081                 oldsd->output_queue = NULL;
8082                 oldsd->output_queue_tailp = &oldsd->output_queue;
8083         }
8084         /* Append NAPI poll list from offline CPU, with one exception :
8085          * process_backlog() must be called by cpu owning percpu backlog.
8086          * We properly handle process_queue & input_pkt_queue later.
8087          */
8088         while (!list_empty(&oldsd->poll_list)) {
8089                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8090                                                             struct napi_struct,
8091                                                             poll_list);
8092
8093                 list_del_init(&napi->poll_list);
8094                 if (napi->poll == process_backlog)
8095                         napi->state = 0;
8096                 else
8097                         ____napi_schedule(sd, napi);
8098         }
8099
8100         raise_softirq_irqoff(NET_TX_SOFTIRQ);
8101         local_irq_enable();
8102
8103         /* Process offline CPU's input_pkt_queue */
8104         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8105                 netif_rx_ni(skb);
8106                 input_queue_head_incr(oldsd);
8107         }
8108         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8109                 netif_rx_ni(skb);
8110                 input_queue_head_incr(oldsd);
8111         }
8112
8113         return NOTIFY_OK;
8114 }
8115
8116
8117 /**
8118  *      netdev_increment_features - increment feature set by one
8119  *      @all: current feature set
8120  *      @one: new feature set
8121  *      @mask: mask feature set
8122  *
8123  *      Computes a new feature set after adding a device with feature set
8124  *      @one to the master device with current feature set @all.  Will not
8125  *      enable anything that is off in @mask. Returns the new feature set.
8126  */
8127 netdev_features_t netdev_increment_features(netdev_features_t all,
8128         netdev_features_t one, netdev_features_t mask)
8129 {
8130         if (mask & NETIF_F_HW_CSUM)
8131                 mask |= NETIF_F_CSUM_MASK;
8132         mask |= NETIF_F_VLAN_CHALLENGED;
8133
8134         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8135         all &= one | ~NETIF_F_ALL_FOR_ALL;
8136
8137         /* If one device supports hw checksumming, set for all. */
8138         if (all & NETIF_F_HW_CSUM)
8139                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8140
8141         return all;
8142 }
8143 EXPORT_SYMBOL(netdev_increment_features);
8144
8145 static struct hlist_head * __net_init netdev_create_hash(void)
8146 {
8147         int i;
8148         struct hlist_head *hash;
8149
8150         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8151         if (hash != NULL)
8152                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8153                         INIT_HLIST_HEAD(&hash[i]);
8154
8155         return hash;
8156 }
8157
8158 /* Initialize per network namespace state */
8159 static int __net_init netdev_init(struct net *net)
8160 {
8161         if (net != &init_net)
8162                 INIT_LIST_HEAD(&net->dev_base_head);
8163
8164         net->dev_name_head = netdev_create_hash();
8165         if (net->dev_name_head == NULL)
8166                 goto err_name;
8167
8168         net->dev_index_head = netdev_create_hash();
8169         if (net->dev_index_head == NULL)
8170                 goto err_idx;
8171
8172         return 0;
8173
8174 err_idx:
8175         kfree(net->dev_name_head);
8176 err_name:
8177         return -ENOMEM;
8178 }
8179
8180 /**
8181  *      netdev_drivername - network driver for the device
8182  *      @dev: network device
8183  *
8184  *      Determine network driver for device.
8185  */
8186 const char *netdev_drivername(const struct net_device *dev)
8187 {
8188         const struct device_driver *driver;
8189         const struct device *parent;
8190         const char *empty = "";
8191
8192         parent = dev->dev.parent;
8193         if (!parent)
8194                 return empty;
8195
8196         driver = parent->driver;
8197         if (driver && driver->name)
8198                 return driver->name;
8199         return empty;
8200 }
8201
8202 static void __netdev_printk(const char *level, const struct net_device *dev,
8203                             struct va_format *vaf)
8204 {
8205         if (dev && dev->dev.parent) {
8206                 dev_printk_emit(level[1] - '0',
8207                                 dev->dev.parent,
8208                                 "%s %s %s%s: %pV",
8209                                 dev_driver_string(dev->dev.parent),
8210                                 dev_name(dev->dev.parent),
8211                                 netdev_name(dev), netdev_reg_state(dev),
8212                                 vaf);
8213         } else if (dev) {
8214                 printk("%s%s%s: %pV",
8215                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
8216         } else {
8217                 printk("%s(NULL net_device): %pV", level, vaf);
8218         }
8219 }
8220
8221 void netdev_printk(const char *level, const struct net_device *dev,
8222                    const char *format, ...)
8223 {
8224         struct va_format vaf;
8225         va_list args;
8226
8227         va_start(args, format);
8228
8229         vaf.fmt = format;
8230         vaf.va = &args;
8231
8232         __netdev_printk(level, dev, &vaf);
8233
8234         va_end(args);
8235 }
8236 EXPORT_SYMBOL(netdev_printk);
8237
8238 #define define_netdev_printk_level(func, level)                 \
8239 void func(const struct net_device *dev, const char *fmt, ...)   \
8240 {                                                               \
8241         struct va_format vaf;                                   \
8242         va_list args;                                           \
8243                                                                 \
8244         va_start(args, fmt);                                    \
8245                                                                 \
8246         vaf.fmt = fmt;                                          \
8247         vaf.va = &args;                                         \
8248                                                                 \
8249         __netdev_printk(level, dev, &vaf);                      \
8250                                                                 \
8251         va_end(args);                                           \
8252 }                                                               \
8253 EXPORT_SYMBOL(func);
8254
8255 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8256 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8257 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8258 define_netdev_printk_level(netdev_err, KERN_ERR);
8259 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8260 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8261 define_netdev_printk_level(netdev_info, KERN_INFO);
8262
8263 static void __net_exit netdev_exit(struct net *net)
8264 {
8265         kfree(net->dev_name_head);
8266         kfree(net->dev_index_head);
8267 }
8268
8269 static struct pernet_operations __net_initdata netdev_net_ops = {
8270         .init = netdev_init,
8271         .exit = netdev_exit,
8272 };
8273
8274 static void __net_exit default_device_exit(struct net *net)
8275 {
8276         struct net_device *dev, *aux;
8277         /*
8278          * Push all migratable network devices back to the
8279          * initial network namespace
8280          */
8281         rtnl_lock();
8282         for_each_netdev_safe(net, dev, aux) {
8283                 int err;
8284                 char fb_name[IFNAMSIZ];
8285
8286                 /* Ignore unmoveable devices (i.e. loopback) */
8287                 if (dev->features & NETIF_F_NETNS_LOCAL)
8288                         continue;
8289
8290                 /* Leave virtual devices for the generic cleanup */
8291                 if (dev->rtnl_link_ops)
8292                         continue;
8293
8294                 /* Push remaining network devices to init_net */
8295                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8296                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8297                 if (err) {
8298                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8299                                  __func__, dev->name, err);
8300                         BUG();
8301                 }
8302         }
8303         rtnl_unlock();
8304 }
8305
8306 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8307 {
8308         /* Return with the rtnl_lock held when there are no network
8309          * devices unregistering in any network namespace in net_list.
8310          */
8311         struct net *net;
8312         bool unregistering;
8313         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8314
8315         add_wait_queue(&netdev_unregistering_wq, &wait);
8316         for (;;) {
8317                 unregistering = false;
8318                 rtnl_lock();
8319                 list_for_each_entry(net, net_list, exit_list) {
8320                         if (net->dev_unreg_count > 0) {
8321                                 unregistering = true;
8322                                 break;
8323                         }
8324                 }
8325                 if (!unregistering)
8326                         break;
8327                 __rtnl_unlock();
8328
8329                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8330         }
8331         remove_wait_queue(&netdev_unregistering_wq, &wait);
8332 }
8333
8334 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8335 {
8336         /* At exit all network devices most be removed from a network
8337          * namespace.  Do this in the reverse order of registration.
8338          * Do this across as many network namespaces as possible to
8339          * improve batching efficiency.
8340          */
8341         struct net_device *dev;
8342         struct net *net;
8343         LIST_HEAD(dev_kill_list);
8344
8345         /* To prevent network device cleanup code from dereferencing
8346          * loopback devices or network devices that have been freed
8347          * wait here for all pending unregistrations to complete,
8348          * before unregistring the loopback device and allowing the
8349          * network namespace be freed.
8350          *
8351          * The netdev todo list containing all network devices
8352          * unregistrations that happen in default_device_exit_batch
8353          * will run in the rtnl_unlock() at the end of
8354          * default_device_exit_batch.
8355          */
8356         rtnl_lock_unregistering(net_list);
8357         list_for_each_entry(net, net_list, exit_list) {
8358                 for_each_netdev_reverse(net, dev) {
8359                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8360                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8361                         else
8362                                 unregister_netdevice_queue(dev, &dev_kill_list);
8363                 }
8364         }
8365         unregister_netdevice_many(&dev_kill_list);
8366         rtnl_unlock();
8367 }
8368
8369 static struct pernet_operations __net_initdata default_device_ops = {
8370         .exit = default_device_exit,
8371         .exit_batch = default_device_exit_batch,
8372 };
8373
8374 /*
8375  *      Initialize the DEV module. At boot time this walks the device list and
8376  *      unhooks any devices that fail to initialise (normally hardware not
8377  *      present) and leaves us with a valid list of present and active devices.
8378  *
8379  */
8380
8381 /*
8382  *       This is called single threaded during boot, so no need
8383  *       to take the rtnl semaphore.
8384  */
8385 static int __init net_dev_init(void)
8386 {
8387         int i, rc = -ENOMEM;
8388
8389         BUG_ON(!dev_boot_phase);
8390
8391         if (dev_proc_init())
8392                 goto out;
8393
8394         if (netdev_kobject_init())
8395                 goto out;
8396
8397         INIT_LIST_HEAD(&ptype_all);
8398         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8399                 INIT_LIST_HEAD(&ptype_base[i]);
8400
8401         INIT_LIST_HEAD(&offload_base);
8402
8403         if (register_pernet_subsys(&netdev_net_ops))
8404                 goto out;
8405
8406         /*
8407          *      Initialise the packet receive queues.
8408          */
8409
8410         for_each_possible_cpu(i) {
8411                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8412                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8413
8414                 INIT_WORK(flush, flush_backlog);
8415
8416                 skb_queue_head_init(&sd->input_pkt_queue);
8417                 skb_queue_head_init(&sd->process_queue);
8418                 INIT_LIST_HEAD(&sd->poll_list);
8419                 sd->output_queue_tailp = &sd->output_queue;
8420 #ifdef CONFIG_RPS
8421                 sd->csd.func = rps_trigger_softirq;
8422                 sd->csd.info = sd;
8423                 sd->cpu = i;
8424 #endif
8425
8426                 sd->backlog.poll = process_backlog;
8427                 sd->backlog.weight = weight_p;
8428         }
8429
8430         dev_boot_phase = 0;
8431
8432         /* The loopback device is special if any other network devices
8433          * is present in a network namespace the loopback device must
8434          * be present. Since we now dynamically allocate and free the
8435          * loopback device ensure this invariant is maintained by
8436          * keeping the loopback device as the first device on the
8437          * list of network devices.  Ensuring the loopback devices
8438          * is the first device that appears and the last network device
8439          * that disappears.
8440          */
8441         if (register_pernet_device(&loopback_net_ops))
8442                 goto out;
8443
8444         if (register_pernet_device(&default_device_ops))
8445                 goto out;
8446
8447         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8448         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8449
8450         hotcpu_notifier(dev_cpu_callback, 0);
8451         dst_subsys_init();
8452         rc = 0;
8453 out:
8454         return rc;
8455 }
8456
8457 subsys_initcall(net_dev_init);