net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
/* Serializes writers of the packet-type handler lists (see dev_add_pack()). */
static DEFINE_SPINLOCK(ptype_lock);
/* Serializes writers of offload_base (see dev_add_offload()). */
static DEFINE_SPINLOCK(offload_lock);
/* Per-protocol handler lists, indexed by ntohs(type) & PTYPE_HASH_MASK. */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
/* List of registered struct packet_offload entries. */
static struct list_head offload_base __read_mostly;

/* Forward declarations of helpers defined later in this file. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
 176  */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * Sequence counter sampled by netdev_get_name() so that it can retry when
 * the counter changed while it was copying a device name.
 * NOTE(review): presumably bumped by the rename path - confirm at the writer.
 */
static seqcount_t devnet_rename_seq;
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 *
 *	One cache-aligned struct softnet_data instance exists per CPU.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
 271  */
/*
 * Device types that get their own lock class.  netdev_lock_name[] and the
 * two key arrays below are indexed in parallel with this table, so the
 * entries must stay in the same order.  The last entry doubles as the
 * default for types not listed here (see netdev_lock_pos()).
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lock names, one per netdev_lock_type[] entry and in the same order. */
static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lock class key per device type, for the xmit lock and the address-list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
/* Without CONFIG_LOCKDEP there is no lock class to assign: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
/* Without CONFIG_LOCKDEP there is no lock class to assign: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the check of protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets were
 *	first on the list, it would not be able to sense that the packet
 *	is cloned and should be copied-on-write, so it would change the
 *	packet and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 375         else
 376                 return pt->dev ? &pt->dev->ptype_specific :
 377                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 378 }
 379
 380 /**
 381  *      dev_add_pack - add packet handler
 382  *      @pt: packet type declaration
 383  *
 384  *      Add a protocol handler to the networking stack. The passed &packet_type
 385  *      is linked into kernel lists and may not be freed until it has been
 386  *      removed from the kernel lists.
 387  *
 388  *      This call does not sleep therefore it can not
 389  *      guarantee all CPU's that are in middle of receiving packets
 390  *      will see the new packet type (until the next received packet).
 391  */
 392
 393 void dev_add_pack(struct packet_type *pt)
 394 {
 395         struct list_head *head = ptype_head(pt);
 396
 397         spin_lock(&ptype_lock);
 398         list_add_rcu(&pt->list, head);
 399         spin_unlock(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head = ptype_head(pt);
 419         struct packet_type *pt1;
 420
 421         spin_lock(&ptype_lock);
 422
 423         list_for_each_entry(pt1, head, list) {
 424                 if (pt == pt1) {
 425                         list_del_rcu(&pt->list);
 426                         goto out;
 427                 }
 428         }
 429
 430         pr_warn("dev_remove_pack: %p not found\n", pt);
 431 out:
 432         spin_unlock(&ptype_lock);
 433 }
 434 EXPORT_SYMBOL(__dev_remove_pack);
 435
 436 /**
 437  *      dev_remove_pack  - remove packet handler
 438  *      @pt: packet type declaration
 439  *
 440  *      Remove a protocol handler that was previously added to the kernel
 441  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 442  *      from the kernel lists and can be freed or reused once this function
 443  *      returns.
 444  *
 445  *      This call sleeps to guarantee that no CPU is looking at the packet
 446  *      type after return.
 447  */
 448 void dev_remove_pack(struct packet_type *pt)
 449 {
 450         __dev_remove_pack(pt);
 451
 452         synchronize_net();
 453 }
 454 EXPORT_SYMBOL(dev_remove_pack);
 455
 456
 457 /**
 458  *      dev_add_offload - register offload handlers
 459  *      @po: protocol offload declaration
 460  *
 461  *      Add protocol offload handlers to the networking stack. The passed
 462  *      &proto_offload is linked into kernel lists and may not be freed until
 463  *      it has been removed from the kernel lists.
 464  *
 465  *      This call does not sleep therefore it can not
 466  *      guarantee all CPU's that are in middle of receiving packets
 467  *      will see the new offload handlers (until the next received packet).
 468  */
 469 void dev_add_offload(struct packet_offload *po)
 470 {
 471         struct list_head *head = &offload_base;
 472
 473         spin_lock(&offload_lock);
 474         list_add_rcu(&po->list, head);
 475         spin_unlock(&offload_lock);
 476 }
 477 EXPORT_SYMBOL(dev_add_offload);
 478
 479 /**
 480  *      __dev_remove_offload     - remove offload handler
 481  *      @po: packet offload declaration
 482  *
 483  *      Remove a protocol offload handler that was previously added to the
 484  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 485  *      is removed from the kernel lists and can be freed or reused once this
 486  *      function returns.
 487  *
 488  *      The packet type might still be in use by receivers
 489  *      and must not be freed until after all the CPU's have gone
 490  *      through a quiescent state.
 491  */
 492 static void __dev_remove_offload(struct packet_offload *po)
 493 {
 494         struct list_head *head = &offload_base;
 495         struct packet_offload *po1;
 496
 497         spin_lock(&offload_lock);
 498
 499         list_for_each_entry(po1, head, list) {
 500                 if (po == po1) {
 501                         list_del_rcu(&po->list);
 502                         goto out;
 503                 }
 504         }
 505
 506         pr_warn("dev_remove_offload: %p not found\n", po);
 507 out:
 508         spin_unlock(&offload_lock);
 509 }
 510
 511 /**
 512  *      dev_remove_offload       - remove packet offload handler
 513  *      @po: packet offload declaration
 514  *
 515  *      Remove a packet offload handler that was previously added to the kernel
 516  *      offload handlers by dev_add_offload(). The passed &offload_type is
 517  *      removed from the kernel lists and can be freed or reused once this
 518  *      function returns.
 519  *
 520  *      This call sleeps to guarantee that no CPU is looking at the packet
 521  *      type after return.
 522  */
 523 void dev_remove_offload(struct packet_offload *po)
 524 {
 525         __dev_remove_offload(po);
 526
 527         synchronize_net();
 528 }
 529 EXPORT_SYMBOL(dev_remove_offload);
 530
 531 /******************************************************************************
 532
 533                       Device Boot-time Settings Routines
 534
 535 *******************************************************************************/
 536
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as unused.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 539
 540 /**
 541  *      netdev_boot_setup_add   - add new setup entry
 542  *      @name: name of the device
 543  *      @map: configured settings for the device
 544  *
 545  *      Adds new setup entry to the dev_boot_setup list.  The function
 546  *      returns 0 on error and 1 on success.  This is a generic routine to
 547  *      all netdevices.
 548  */
 549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 550 {
 551         struct netdev_boot_setup *s;
 552         int i;
 553
 554         s = dev_boot_setup;
 555         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 556                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 557                         memset(s[i].name, 0, sizeof(s[i].name));
 558                         strlcpy(s[i].name, name, IFNAMSIZ);
 559                         memcpy(&s[i].map, map, sizeof(s[i].map));
 560                         break;
 561                 }
 562         }
 563
 564         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 565 }
 566
 567 /**
 568  *      netdev_boot_setup_check - check boot time settings
 569  *      @dev: the netdevice
 570  *
 571  *      Check boot time settings for the device.
 572  *      The found settings are set for the device to be used
 573  *      later in the device probing.
 574  *      Returns 0 if no settings found, 1 if they are.
 575  */
 576 int netdev_boot_setup_check(struct net_device *dev)
 577 {
 578         struct netdev_boot_setup *s = dev_boot_setup;
 579         int i;
 580
 581         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 582                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 583                     !strcmp(dev->name, s[i].name)) {
 584                         dev->irq        = s[i].map.irq;
 585                         dev->base_addr  = s[i].map.base_addr;
 586                         dev->mem_start  = s[i].map.mem_start;
 587                         dev->mem_end    = s[i].map.mem_end;
 588                         return 1;
 589                 }
 590         }
 591         return 0;
 592 }
 593 EXPORT_SYMBOL(netdev_boot_setup_check);
 594
 595
 596 /**
 597  *      netdev_boot_base        - get address from boot time settings
 598  *      @prefix: prefix for network device
 599  *      @unit: id for network device
 600  *
 601  *      Check boot time settings for the base address of device.
 602  *      The found settings are set for the device to be used
 603  *      later in the device probing.
 604  *      Returns 0 if no settings found.
 605  */
 606 unsigned long netdev_boot_base(const char *prefix, int unit)
 607 {
 608         const struct netdev_boot_setup *s = dev_boot_setup;
 609         char name[IFNAMSIZ];
 610         int i;
 611
 612         sprintf(name, "%s%d", prefix, unit);
 613
 614         /*
 615          * If device already registered then return base of 1
 616          * to indicate not to probe for this interface
 617          */
 618         if (__dev_get_by_name(&init_net, name))
 619                 return 1;
 620
 621         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 622                 if (!strcmp(name, s[i].name))
 623                         return s[i].map.base_addr;
 624         return 0;
 625 }
 626
 627 /*
 628  * Saves at boot time configured settings for any netdevice.
 629  */
 630 int __init netdev_boot_setup(char *str)
 631 {
 632         int ints[5];
 633         struct ifmap map;
 634
 635         str = get_options(str, ARRAY_SIZE(ints), ints);
 636         if (!str || !*str)
 637                 return 0;
 638
 639         /* Save settings */
 640         memset(&map, 0, sizeof(map));
 641         if (ints[0] > 0)
 642                 map.irq = ints[1];
 643         if (ints[0] > 1)
 644                 map.base_addr = ints[2];
 645         if (ints[0] > 2)
 646                 map.mem_start = ints[3];
 647         if (ints[0] > 3)
 648                 map.mem_end = ints[4];
 649
 650         /* Add new entry to the list */
 651         return netdev_boot_setup_add(str, &map);
 652 }
 653
 654 __setup("netdev=", netdev_boot_setup);
 655
 656 /*******************************************************************************
 657
 658                             Device Interface Subroutines
 659
 660 *******************************************************************************/
 661
 662 /**
 663  *      dev_get_iflink  - get 'iflink' value of a interface
 664  *      @dev: targeted interface
 665  *
 666  *      Indicates the ifindex the interface is linked to.
 667  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 668  */
 669
 670 int dev_get_iflink(const struct net_device *dev)
 671 {
 672         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 673                 return dev->netdev_ops->ndo_get_iflink(dev);
 674
 675         return dev->ifindex;
 676 }
 677 EXPORT_SYMBOL(dev_get_iflink);
 678
 679 /**
 680  *      __dev_get_by_name       - find a device by its name
 681  *      @net: the applicable net namespace
 682  *      @name: name to find
 683  *
 684  *      Find an interface by name. Must be called under RTNL semaphore
 685  *      or @dev_base_lock. If the name is found a pointer to the device
 686  *      is returned. If the name is not found then %NULL is returned. The
 687  *      reference counters are not incremented so the caller must be
 688  *      careful with locks.
 689  */
 690
 691 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 692 {
 693         struct net_device *dev;
 694         struct hlist_head *head = dev_name_hash(net, name);
 695
 696         hlist_for_each_entry(dev, head, name_hlist)
 697                 if (!strncmp(dev->name, name, IFNAMSIZ))
 698                         return dev;
 699
 700         return NULL;
 701 }
 702 EXPORT_SYMBOL(__dev_get_by_name);
 703
 704 /**
 705  *      dev_get_by_name_rcu     - find a device by its name
 706  *      @net: the applicable net namespace
 707  *      @name: name to find
 708  *
 709  *      Find an interface by name.
 710  *      If the name is found a pointer to the device is returned.
 711  *      If the name is not found then %NULL is returned.
 712  *      The reference counters are not incremented so the caller must be
 713  *      careful with locks. The caller must hold RCU lock.
 714  */
 715
 716 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 717 {
 718         struct net_device *dev;
 719         struct hlist_head *head = dev_name_hash(net, name);
 720
 721         hlist_for_each_entry_rcu(dev, head, name_hlist)
 722                 if (!strncmp(dev->name, name, IFNAMSIZ))
 723                         return dev;
 724
 725         return NULL;
 726 }
 727 EXPORT_SYMBOL(dev_get_by_name_rcu);
 728
 729 /**
 730  *      dev_get_by_name         - find a device by its name
 731  *      @net: the applicable net namespace
 732  *      @name: name to find
 733  *
 734  *      Find an interface by name. This can be called from any
 735  *      context and does its own locking. The returned handle has
 736  *      the usage count incremented and the caller must use dev_put() to
 737  *      release it when it is no longer needed. %NULL is returned if no
 738  *      matching device is found.
 739  */
 740
 741 struct net_device *dev_get_by_name(struct net *net, const char *name)
 742 {
 743         struct net_device *dev;
 744
 745         rcu_read_lock();
 746         dev = dev_get_by_name_rcu(net, name);
 747         if (dev)
 748                 dev_hold(dev);
 749         rcu_read_unlock();
 750         return dev;
 751 }
 752 EXPORT_SYMBOL(dev_get_by_name);
 753
 754 /**
 755  *      __dev_get_by_index - find a device by its ifindex
 756  *      @net: the applicable net namespace
 757  *      @ifindex: index of device
 758  *
 759  *      Search for an interface by index. Returns %NULL if the device
 760  *      is not found or a pointer to the device. The device has not
 761  *      had its reference counter increased so the caller must be careful
 762  *      about locking. The caller must hold either the RTNL semaphore
 763  *      or @dev_base_lock.
 764  */
 765
 766 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 767 {
 768         struct net_device *dev;
 769         struct hlist_head *head = dev_index_hash(net, ifindex);
 770
 771         hlist_for_each_entry(dev, head, index_hlist)
 772                 if (dev->ifindex == ifindex)
 773                         return dev;
 774
 775         return NULL;
 776 }
 777 EXPORT_SYMBOL(__dev_get_by_index);
 778
 779 /**
 780  *      dev_get_by_index_rcu - find a device by its ifindex
 781  *      @net: the applicable net namespace
 782  *      @ifindex: index of device
 783  *
 784  *      Search for an interface by index. Returns %NULL if the device
 785  *      is not found or a pointer to the device. The device has not
 786  *      had its reference counter increased so the caller must be careful
 787  *      about locking. The caller must hold RCU lock.
 788  */
 789
 790 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 791 {
 792         struct net_device *dev;
 793         struct hlist_head *head = dev_index_hash(net, ifindex);
 794
 795         hlist_for_each_entry_rcu(dev, head, index_hlist)
 796                 if (dev->ifindex == ifindex)
 797                         return dev;
 798
 799         return NULL;
 800 }
 801 EXPORT_SYMBOL(dev_get_by_index_rcu);
 802
 803
 804 /**
 805  *      dev_get_by_index - find a device by its ifindex
 806  *      @net: the applicable net namespace
 807  *      @ifindex: index of device
 808  *
 809  *      Search for an interface by index. Returns NULL if the device
 810  *      is not found or a pointer to the device. The device returned has
 811  *      had a reference added and the pointer is safe until the user calls
 812  *      dev_put to indicate they have finished with it.
 813  */
 814
 815 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 816 {
 817         struct net_device *dev;
 818
 819         rcu_read_lock();
 820         dev = dev_get_by_index_rcu(net, ifindex);
 821         if (dev)
 822                 dev_hold(dev);
 823         rcu_read_unlock();
 824         return dev;
 825 }
 826 EXPORT_SYMBOL(dev_get_by_index);
 827
 828 /**
 829  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 830  *      @net: network namespace
 831  *      @name: a pointer to the buffer where the name will be stored.
 832  *      @ifindex: the ifindex of the interface to get the name from.
 833  *
 834  *      The use of raw_seqcount_begin() and cond_resched() before
 835  *      retrying is required as we want to give the writers a chance
 836  *      to complete when CONFIG_PREEMPT is not set.
 837  */
 838 int netdev_get_name(struct net *net, char *name, int ifindex)
 839 {
 840         struct net_device *dev;
 841         unsigned int seq;
 842
 843 retry:
 844         seq = raw_seqcount_begin(&devnet_rename_seq);
 845         rcu_read_lock();
 846         dev = dev_get_by_index_rcu(net, ifindex);
 847         if (!dev) {
 848                 rcu_read_unlock();
 849                 return -ENODEV;
 850         }
 851
 852         strcpy(name, dev->name);
 853         rcu_read_unlock();
 854         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 855                 cond_resched();
 856                 goto retry;
 857         }
 858
 859         return 0;
 860 }
 861
 862 /**
 863  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 864  *      @net: the applicable net namespace
 865  *      @type: media type of device
 866  *      @ha: hardware address
 867  *
 868  *      Search for an interface by MAC address. Returns NULL if the device
 869  *      is not found or a pointer to the device.
 870  *      The caller must hold RCU or RTNL.
 871  *      The returned device has not had its ref count increased
 872  *      and the caller must therefore be careful about locking
 873  *
 874  */
 875
 876 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 877                                        const char *ha)
 878 {
 879         struct net_device *dev;
 880
 881         for_each_netdev_rcu(net, dev)
 882                 if (dev->type == type &&
 883                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 884                         return dev;
 885
 886         return NULL;
 887 }
 888 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 889
 890 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 891 {
 892         struct net_device *dev;
 893
 894         ASSERT_RTNL();
 895         for_each_netdev(net, dev)
 896                 if (dev->type == type)
 897                         return dev;
 898
 899         return NULL;
 900 }
 901 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 902
 903 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 904 {
 905         struct net_device *dev, *ret = NULL;
 906
 907         rcu_read_lock();
 908         for_each_netdev_rcu(net, dev)
 909                 if (dev->type == type) {
 910                         dev_hold(dev);
 911                         ret = dev;
 912                         break;
 913                 }
 914         rcu_read_unlock();
 915         return ret;
 916 }
 917 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 918
 919 /**
 920  *      __dev_get_by_flags - find any device with given flags
 921  *      @net: the applicable net namespace
 922  *      @if_flags: IFF_* values
 923  *      @mask: bitmask of bits in if_flags to check
 924  *
 925  *      Search for any interface with the given flags. Returns NULL if a device
 926  *      is not found or a pointer to the device. Must be called inside
 927  *      rtnl_lock(), and result refcount is unchanged.
 928  */
 929
 930 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 931                                       unsigned short mask)
 932 {
 933         struct net_device *dev, *ret;
 934
 935         ASSERT_RTNL();
 936
 937         ret = NULL;
 938         for_each_netdev(net, dev) {
 939                 if (((dev->flags ^ if_flags) & mask) == 0) {
 940                         ret = dev;
 941                         break;
 942                 }
 943         }
 944         return ret;
 945 }
 946 EXPORT_SYMBOL(__dev_get_by_flags);
 947
 948 /**
 949  *      dev_valid_name - check if name is okay for network device
 950  *      @name: name string
 951  *
 952  *      Network device names need to be valid file names to
 953  *      to allow sysfs to work.  We also disallow any kind of
 954  *      whitespace.
 955  */
 956 bool dev_valid_name(const char *name)
 957 {
 958         if (*name == '\0')
 959                 return false;
 960         if (strlen(name) >= IFNAMSIZ)
 961                 return false;
 962         if (!strcmp(name, ".") || !strcmp(name, ".."))
 963                 return false;
 964
 965         while (*name) {
 966                 if (*name == '/' || *name == ':' || isspace(*name))
 967                         return false;
 968                 name++;
 969         }
 970         return true;
 971 }
 972 EXPORT_SYMBOL(dev_valid_name);
 973
 974 /**
 975  *      __dev_alloc_name - allocate a name for a device
 976  *      @net: network namespace to allocate the device name in
 977  *      @name: name format string
 978  *      @buf:  scratch buffer and result name string
 979  *
 980  *      Passed a format string - eg "lt%d" it will try and find a suitable
 981  *      id. It scans list of devices to build up a free map, then chooses
 982  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 983  *      while allocating the name and adding the device in order to avoid
 984  *      duplicates.
 985  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 986  *      Returns the number of the unit assigned or a negative errno code.
 987  */
 988
 989 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 990 {
 991         int i = 0;
 992         const char *p;
 993         const int max_netdevices = 8*PAGE_SIZE;
 994         unsigned long *inuse;
 995         struct net_device *d;
 996
 997         p = strnchr(name, IFNAMSIZ-1, '%');
 998         if (p) {
 999                 /*
1000                  * Verify the string as this thing may have come from
1001                  * the user.  There must be either one "%d" and no other "%"
1002                  * characters.
1003                  */
1004                 if (p[1] != 'd' || strchr(p + 2, '%'))
1005                         return -EINVAL;
1006
1007                 /* Use one page as a bit array of possible slots */
1008                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1009                 if (!inuse)
1010                         return -ENOMEM;
1011
1012                 for_each_netdev(net, d) {
1013                         if (!sscanf(d->name, name, &i))
1014                                 continue;
1015                         if (i < 0 || i >= max_netdevices)
1016                                 continue;
1017
1018                         /*  avoid cases where sscanf is not exact inverse of printf */
1019                         snprintf(buf, IFNAMSIZ, name, i);
1020                         if (!strncmp(buf, d->name, IFNAMSIZ))
1021                                 set_bit(i, inuse);
1022                 }
1023
1024                 i = find_first_zero_bit(inuse, max_netdevices);
1025                 free_page((unsigned long) inuse);
1026         }
1027
1028         if (buf != name)
1029                 snprintf(buf, IFNAMSIZ, name, i);
1030         if (!__dev_get_by_name(net, buf))
1031                 return i;
1032
1033         /* It is possible to run out of possible slots
1034          * when the name is long and there isn't enough space left
1035          * for the digits, or if all bits are used.
1036          */
1037         return -ENFILE;
1038 }
1039
1040 /**
1041  *      dev_alloc_name - allocate a name for a device
1042  *      @dev: device
1043  *      @name: name format string
1044  *
1045  *      Passed a format string - eg "lt%d" it will try and find a suitable
1046  *      id. It scans list of devices to build up a free map, then chooses
1047  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1048  *      while allocating the name and adding the device in order to avoid
1049  *      duplicates.
1050  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1051  *      Returns the number of the unit assigned or a negative errno code.
1052  */
1053
1054 int dev_alloc_name(struct net_device *dev, const char *name)
1055 {
1056         char buf[IFNAMSIZ];
1057         struct net *net;
1058         int ret;
1059
1060         BUG_ON(!dev_net(dev));
1061         net = dev_net(dev);
1062         ret = __dev_alloc_name(net, name, buf);
1063         if (ret >= 0)
1064                 strlcpy(dev->name, buf, IFNAMSIZ);
1065         return ret;
1066 }
1067 EXPORT_SYMBOL(dev_alloc_name);
1068
1069 static int dev_alloc_name_ns(struct net *net,
1070                              struct net_device *dev,
1071                              const char *name)
1072 {
1073         char buf[IFNAMSIZ];
1074         int ret;
1075
1076         ret = __dev_alloc_name(net, name, buf);
1077         if (ret >= 0)
1078                 strlcpy(dev->name, buf, IFNAMSIZ);
1079         return ret;
1080 }
1081
1082 static int dev_get_valid_name(struct net *net,
1083                               struct net_device *dev,
1084                               const char *name)
1085 {
1086         BUG_ON(!net);
1087
1088         if (!dev_valid_name(name))
1089                 return -EINVAL;
1090
1091         if (strchr(name, '%'))
1092                 return dev_alloc_name_ns(net, dev, name);
1093         else if (__dev_get_by_name(net, name))
1094                 return -EEXIST;
1095         else if (dev->name != name)
1096                 strlcpy(dev->name, name, IFNAMSIZ);
1097
1098         return 0;
1099 }
1100
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: new name or format string; the buffer must be at least
 *                IFNAMSIZ bytes long because the rollback path copies
 *                IFNAMSIZ bytes out of it
 *
 *      Change name of a device; a format string such as "eth%d" can be
 *      passed for wildcarding.
 *
 *      Returns 0 on success or when @newname equals the current name,
 *      -EBUSY when the device has IFF_UP set, the error from
 *      dev_get_valid_name() or device_rename() when either fails, or the
 *      errno reported by the NETDEV_CHANGENAME notifier chain, in which
 *      case the function tries to rename the device back to its old name.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /* Writer side of the sequence counter that netdev_get_name() retries on. */
        write_seqcount_begin(&devnet_rename_seq);

        /* Nothing to do when the name does not change. */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* On success dev->name already holds the new name. */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Reached a second time, with dev->name set back to the old name,
         * when the notifier chain below vetoes the rename.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /*
         * Move the device to the name-hash bucket of its current name:
         * unhash, wait for an RCU grace period, then hash again.
         */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First failure: record it, swap old and new names
                         * and assign types, then redo the rename in the
                         * opposite direction.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The notifiers rejected the rollback as well. */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1189
1190 /**
1191  *      dev_set_alias - change ifalias of a device
1192  *      @dev: device
1193  *      @alias: name up to IFALIASZ
1194  *      @len: limit of bytes to copy from info
1195  *
1196  *      Set ifalias for a device,
1197  */
1198 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1199 {
1200         char *new_ifalias;
1201
1202         ASSERT_RTNL();
1203
1204         if (len >= IFALIASZ)
1205                 return -EINVAL;
1206
1207         if (!len) {
1208                 kfree(dev->ifalias);
1209                 dev->ifalias = NULL;
1210                 return 0;
1211         }
1212
1213         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1214         if (!new_ifalias)
1215                 return -ENOMEM;
1216         dev->ifalias = new_ifalias;
1217
1218         strlcpy(dev->ifalias, alias, len+1);
1219         return len;
1220 }
1221
1222
1223 /**
1224  *      netdev_features_change - device changes features
1225  *      @dev: device to cause notification
1226  *
1227  *      Called to indicate a device has changed features.
1228  */
1229 void netdev_features_change(struct net_device *dev)
1230 {
1231         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1232 }
1233 EXPORT_SYMBOL(netdev_features_change);
1234
1235 /**
1236  *      netdev_state_change - device changes state
1237  *      @dev: device to cause notification
1238  *
1239  *      Called to indicate a device has changed state. This function calls
1240  *      the notifier chains for netdev_chain and sends a NEWLINK message
1241  *      to the routing socket.
1242  */
1243 void netdev_state_change(struct net_device *dev)
1244 {
1245         if (dev->flags & IFF_UP) {
1246                 struct netdev_notifier_change_info change_info;
1247
1248                 change_info.flags_changed = 0;
1249                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1250                                               &change_info.info);
1251                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1252         }
1253 }
1254 EXPORT_SYMBOL(netdev_state_change);
1255
1256 /**
1257  *      netdev_notify_peers - notify network peers about existence of @dev
1258  *      @dev: network device
1259  *
1260  * Generate traffic such that interested network peers are aware of
1261  * @dev, such as by generating a gratuitous ARP. This may be used when
1262  * a device wants to inform the rest of the network about some sort of
1263  * reconfiguration such as a failover event or virtual machine
1264  * migration.
1265  */
1266 void netdev_notify_peers(struct net_device *dev)
1267 {
1268         rtnl_lock();
1269         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1270         rtnl_unlock();
1271 }
1272 EXPORT_SYMBOL(netdev_notify_peers);
1273
1274 static int __dev_open(struct net_device *dev)
1275 {
1276         const struct net_device_ops *ops = dev->netdev_ops;
1277         int ret;
1278
1279         ASSERT_RTNL();
1280
1281         if (!netif_device_present(dev))
1282                 return -ENODEV;
1283
1284         /* Block netpoll from trying to do any rx path servicing.
1285          * If we don't do this there is a chance ndo_poll_controller
1286          * or ndo_poll may be running while we open the device
1287          */
1288         netpoll_poll_disable(dev);
1289
1290         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1291         ret = notifier_to_errno(ret);
1292         if (ret)
1293                 return ret;
1294
1295         set_bit(__LINK_STATE_START, &dev->state);
1296
1297         if (ops->ndo_validate_addr)
1298                 ret = ops->ndo_validate_addr(dev);
1299
1300         if (!ret && ops->ndo_open)
1301                 ret = ops->ndo_open(dev);
1302
1303         netpoll_poll_enable(dev);
1304
1305         if (ret)
1306                 clear_bit(__LINK_STATE_START, &dev->state);
1307         else {
1308                 dev->flags |= IFF_UP;
1309                 dev_set_rx_mode(dev);
1310                 dev_activate(dev);
1311                 add_device_randomness(dev->dev_addr, dev->addr_len);
1312         }
1313
1314         return ret;
1315 }
1316
1317 /**
1318  *      dev_open        - prepare an interface for use.
1319  *      @dev:   device to open
1320  *
1321  *      Takes a device from down to up state. The device's private open
1322  *      function is invoked and then the multicast lists are loaded. Finally
1323  *      the device is moved into the up state and a %NETDEV_UP message is
1324  *      sent to the netdev notifier chain.
1325  *
1326  *      Calling this function on an active interface is a nop. On a failure
1327  *      a negative errno code is returned.
1328  */
1329 int dev_open(struct net_device *dev)
1330 {
1331         int ret;
1332
1333         if (dev->flags & IFF_UP)
1334                 return 0;
1335
1336         ret = __dev_open(dev);
1337         if (ret < 0)
1338                 return ret;
1339
1340         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1341         call_netdevice_notifiers(NETDEV_UP, dev);
1342
1343         return ret;
1344 }
1345 EXPORT_SYMBOL(dev_open);
1346
/*
 * Take every device linked on @head through close_list down. RTNL is
 * asserted to be held and the function may sleep.
 *
 * Works in three passes: announce NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START on each device, deactivate all of them with one
 * dev_deactivate_many() call, then invoke each driver's ndo_stop() and
 * clear IFF_UP. Netpoll is disabled for a device in the first pass and
 * re-enabled in the last one. NETDEV_DOWN is not sent here. Always
 * returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                netpoll_poll_enable(dev);
        }

        return 0;
}
1392
1393 static int __dev_close(struct net_device *dev)
1394 {
1395         int retval;
1396         LIST_HEAD(single);
1397
1398         list_add(&dev->close_list, &single);
1399         retval = __dev_close_many(&single);
1400         list_del(&single);
1401
1402         return retval;
1403 }
1404
1405 int dev_close_many(struct list_head *head, bool unlink)
1406 {
1407         struct net_device *dev, *tmp;
1408
1409         /* Remove the devices that don't need to be closed */
1410         list_for_each_entry_safe(dev, tmp, head, close_list)
1411                 if (!(dev->flags & IFF_UP))
1412                         list_del_init(&dev->close_list);
1413
1414         __dev_close_many(head);
1415
1416         list_for_each_entry_safe(dev, tmp, head, close_list) {
1417                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1418                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1419                 if (unlink)
1420                         list_del_init(&dev->close_list);
1421         }
1422
1423         return 0;
1424 }
1425 EXPORT_SYMBOL(dev_close_many);
1426
1427 /**
1428  *      dev_close - shutdown an interface.
1429  *      @dev: device to shutdown
1430  *
1431  *      This function moves an active device into down state. A
1432  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1433  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1434  *      chain.
1435  */
1436 int dev_close(struct net_device *dev)
1437 {
1438         if (dev->flags & IFF_UP) {
1439                 LIST_HEAD(single);
1440
1441                 list_add(&dev->close_list, &single);
1442                 dev_close_many(&single, true);
1443                 list_del(&single);
1444         }
1445         return 0;
1446 }
1447 EXPORT_SYMBOL(dev_close);
1448
1449
1450 /**
1451  *      dev_disable_lro - disable Large Receive Offload on a device
1452  *      @dev: device
1453  *
1454  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1455  *      called under RTNL.  This is needed if received packets may be
1456  *      forwarded to another interface.
1457  */
1458 void dev_disable_lro(struct net_device *dev)
1459 {
1460         struct net_device *lower_dev;
1461         struct list_head *iter;
1462
1463         dev->wanted_features &= ~NETIF_F_LRO;
1464         netdev_update_features(dev);
1465
1466         if (unlikely(dev->features & NETIF_F_LRO))
1467                 netdev_WARN(dev, "failed to disable LRO!\n");
1468
1469         netdev_for_each_lower_dev(dev, lower_dev, iter)
1470                 dev_disable_lro(lower_dev);
1471 }
1472 EXPORT_SYMBOL(dev_disable_lro);
1473
1474 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1475                                    struct net_device *dev)
1476 {
1477         struct netdev_notifier_info info;
1478
1479         netdev_notifier_info_init(&info, dev);
1480         return nb->notifier_call(nb, val, &info);
1481 }
1482
/*
 * While this is non-zero, register_netdevice_notifier() adds the notifier
 * to the chain but does not replay events for existing devices.
 * NOTE(review): presumably cleared once boot-time initialisation of the
 * device layer has finished - confirm where it is reset.
 */
static int dev_boot_phase = 1;
1484
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered, all registration and up events are replayed
 *      to the new notifier to allow it to have a race free
 *      view of the network device list. The replay is skipped while
 *      dev_boot_phase is set. If the notifier rejects a replayed
 *      %NETDEV_REGISTER event, the events already replayed are undone
 *      and the notifier is removed from the chain again.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay during the boot phase; err is 0 here. */
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        /* The result of the replayed NETDEV_UP is not checked. */
                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /*
         * dev is the device whose NETDEV_REGISTER was rejected. Walk the
         * devices in the same order again and undo the replay for each
         * one that was visited before it.
         */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        /* Leave via the unlock label so err carries the notifier's errno. */
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
1550 EXPORT_SYMBOL(register_netdevice_notifier);
1551
1552 /**
1553  *      unregister_netdevice_notifier - unregister a network notifier block
1554  *      @nb: notifier
1555  *
1556  *      Unregister a notifier previously registered by
1557  *      register_netdevice_notifier(). The notifier is unlinked into the
1558  *      kernel structures and may then be reused. A negative errno code
1559  *      is returned on a failure.
1560  *
1561  *      After unregistering unregister and down device events are synthesized
1562  *      for all devices on the device list to the removed notifier to remove
1563  *      the need for special case cleanup code.
1564  */
1565
1566 int unregister_netdevice_notifier(struct notifier_block *nb)
1567 {
1568         struct net_device *dev;
1569         struct net *net;
1570         int err;
1571
1572         rtnl_lock();
1573         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1574         if (err)
1575                 goto unlock;
1576
1577         for_each_net(net) {
1578                 for_each_netdev(net, dev) {
1579                         if (dev->flags & IFF_UP) {
1580                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1581                                                         dev);
1582                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1583                         }
1584                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1585                 }
1586         }
1587 unlock:
1588         rtnl_unlock();
1589         return err;
1590 }
1591 EXPORT_SYMBOL(unregister_netdevice_notifier);
1592
1593 /**
1594  *      call_netdevice_notifiers_info - call all network notifier blocks
1595  *      @val: value passed unmodified to notifier function
1596  *      @dev: net_device pointer passed unmodified to notifier function
1597  *      @info: notifier information data
1598  *
1599  *      Call all network notifier blocks.  Parameters and return value
1600  *      are as for raw_notifier_call_chain().
1601  */
1602
1603 static int call_netdevice_notifiers_info(unsigned long val,
1604                                          struct net_device *dev,
1605                                          struct netdev_notifier_info *info)
1606 {
1607         ASSERT_RTNL();
1608         netdev_notifier_info_init(info, dev);
1609         return raw_notifier_call_chain(&netdev_chain, val, info);
1610 }
1611
1612 /**
1613  *      call_netdevice_notifiers - call all network notifier blocks
1614  *      @val: value passed unmodified to notifier function
1615  *      @dev: net_device pointer passed unmodified to notifier function
1616  *
1617  *      Call all network notifier blocks.  Parameters and return value
1618  *      are as for raw_notifier_call_chain().
1619  */
1620
1621 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1622 {
1623         struct netdev_notifier_info info;
1624
1625         return call_netdevice_notifiers_info(val, dev, &info);
1626 }
1627 EXPORT_SYMBOL(call_netdevice_notifiers);
1628
#ifdef CONFIG_NET_CLS_ACT
/* Static key adjusted by the two exported helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Increment the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
        struct static_key *key = &ingress_needed;

        static_key_slow_inc(key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Decrement the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
        struct static_key *key = &ingress_needed;

        static_key_slow_dec(key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1644
/* Static key tested by net_timestamp_set() and net_timestamp_check(). */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context.
 * If net_disable_timestamp() is called from irq context, the
 * static_key_slow_dec() call is deferred: it is counted here and
 * handled by the next net_enable_timestamp().
 */
static atomic_t netstamp_needed_deferred;
#endif
1653
1654 void net_enable_timestamp(void)
1655 {
1656 #ifdef HAVE_JUMP_LABEL
1657         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1658
1659         if (deferred) {
1660                 while (--deferred)
1661                         static_key_slow_dec(&netstamp_needed);
1662                 return;
1663         }
1664 #endif
1665         static_key_slow_inc(&netstamp_needed);
1666 }
1667 EXPORT_SYMBOL(net_enable_timestamp);
1668
1669 void net_disable_timestamp(void)
1670 {
1671 #ifdef HAVE_JUMP_LABEL
1672         if (in_interrupt()) {
1673                 atomic_inc(&netstamp_needed_deferred);
1674                 return;
1675         }
1676 #endif
1677         static_key_slow_dec(&netstamp_needed);
1678 }
1679 EXPORT_SYMBOL(net_disable_timestamp);
1680
1681 static inline void net_timestamp_set(struct sk_buff *skb)
1682 {
1683         skb->tstamp.tv64 = 0;
1684         if (static_key_false(&netstamp_needed))
1685                 __net_timestamp(skb);
1686 }
1687
/*
 * Call __net_timestamp(SKB) when the netstamp_needed static key is
 * enabled, COND is true and SKB does not carry a timestamp yet.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a
 * single statement: it needs the caller's trailing semicolon and cannot
 * capture a following "else". SKB is expanded more than once, so pass an
 * expression without side effects.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1693
1694 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1695 {
1696         unsigned int len;
1697
1698         if (!(dev->flags & IFF_UP))
1699                 return false;
1700
1701         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1702         if (skb->len <= len)
1703                 return true;
1704
1705         /* if TSO is enabled, we don't care about the length as the packet
1706          * could be forwarded without being segmented before
1707          */
1708         if (skb_is_gso(skb))
1709                 return true;
1710
1711         return false;
1712 }
1713 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1714
1715 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1716 {
1717         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1718             unlikely(!is_skb_forwardable(dev, skb))) {
1719                 atomic_long_inc(&dev->rx_dropped);
1720                 kfree_skb(skb);
1721                 return NET_RX_DROP;
1722         }
1723
1724         skb_scrub_packet(skb, true);
1725         skb->priority = 0;
1726         skb->protocol = eth_type_trans(skb, dev);
1727         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1728
1729         return 0;
1730 }
1731 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1732
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * Used to inject an skb from the start_xmit function of one device into
 * the receive queue of another device.  As the receiving device may live
 * in another namespace, all information in the skb that could impact
 * namespace isolation has to be cleared first.
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1756
1757 static inline int deliver_skb(struct sk_buff *skb,
1758                               struct packet_type *pt_prev,
1759                               struct net_device *orig_dev)
1760 {
1761         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1762                 return -ENOMEM;
1763         atomic_inc(&skb->users);
1764         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1765 }
1766
1767 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1768                                           struct packet_type **pt,
1769                                           struct net_device *orig_dev,
1770                                           __be16 type,
1771                                           struct list_head *ptype_list)
1772 {
1773         struct packet_type *ptype, *pt_prev = *pt;
1774
1775         list_for_each_entry_rcu(ptype, ptype_list, list) {
1776                 if (ptype->type != type)
1777                         continue;
1778                 if (pt_prev)
1779                         deliver_skb(skb, pt_prev, orig_dev);
1780                 pt_prev = ptype;
1781         }
1782         *pt = pt_prev;
1783 }
1784
1785 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1786 {
1787         if (!ptype->af_packet_priv || !skb->sk)
1788                 return false;
1789
1790         if (ptype->id_match)
1791                 return ptype->id_match(ptype, skb->sk);
1792         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1793                 return true;
1794
1795         return false;
1796 }
1797
1798 /*
1799  *      Support routine. Sends outgoing frames to any network
1800  *      taps currently in use.
1801  */
1802
1803 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1804 {
1805         struct packet_type *ptype;
1806         struct sk_buff *skb2 = NULL;
1807         struct packet_type *pt_prev = NULL;
1808         struct list_head *ptype_list = &ptype_all;
1809
1810         rcu_read_lock();
1811 again:
1812         list_for_each_entry_rcu(ptype, ptype_list, list) {
1813                 /* Never send packets back to the socket
1814                  * they originated from - MvS (miquels@drinkel.ow.org)
1815                  */
1816                 if (skb_loop_sk(ptype, skb))
1817                         continue;
1818
1819                 if (pt_prev) {
1820                         deliver_skb(skb2, pt_prev, skb->dev);
1821                         pt_prev = ptype;
1822                         continue;
1823                 }
1824
1825                 /* need to clone skb, done only once */
1826                 skb2 = skb_clone(skb, GFP_ATOMIC);
1827                 if (!skb2)
1828                         goto out_unlock;
1829
1830                 net_timestamp_set(skb2);
1831
1832                 /* skb->nh should be correctly
1833                  * set by sender, so that the second statement is
1834                  * just protection against buggy protocols.
1835                  */
1836                 skb_reset_mac_header(skb2);
1837
1838                 if (skb_network_header(skb2) < skb2->data ||
1839                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1840                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1841                                              ntohs(skb2->protocol),
1842                                              dev->name);
1843                         skb_reset_network_header(skb2);
1844                 }
1845
1846                 skb2->transport_header = skb2->network_header;
1847                 skb2->pkt_type = PACKET_OUTGOING;
1848                 pt_prev = ptype;
1849         }
1850
1851         if (ptype_list == &ptype_all) {
1852                 ptype_list = &dev->ptype_all;
1853                 goto again;
1854         }
1855 out_unlock:
1856         if (pt_prev)
1857                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1858         rcu_read_unlock();
1859 }
1860
1861 /**
1862  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1863  * @dev: Network device
1864  * @txq: number of queues available
1865  *
1866  * If real_num_tx_queues is changed the tc mappings may no longer be
1867  * valid. To resolve this verify the tc mapping remains valid and if
1868  * not NULL the mapping. With no priorities mapping to this
1869  * offset/count pair it will no longer be used. In the worst case TC0
1870  * is invalid nothing can be done so disable priority mappings. If is
1871  * expected that drivers will fix this mapping if they can before
1872  * calling netif_set_real_num_tx_queues.
1873  */
1874 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1875 {
1876         int i;
1877         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1878
1879         /* If TC0 is invalidated disable TC mapping */
1880         if (tc->offset + tc->count > txq) {
1881                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1882                 dev->num_tc = 0;
1883                 return;
1884         }
1885
1886         /* Invalidated prio to tc mappings set to TC0 */
1887         for (i = 1; i < TC_BITMASK + 1; i++) {
1888                 int q = netdev_get_prio_tc_map(dev, i);
1889
1890                 tc = &dev->tc_to_txq[q];
1891                 if (tc->offset + tc->count > txq) {
1892                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1893                                 i, q);
1894                         netdev_set_prio_tc_map(dev, i, 0);
1895                 }
1896         }
1897 }
1898
1899 #ifdef CONFIG_XPS
/* Taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);

/* Update-side fetch of an XPS RCU pointer; xps_map_mutex must be held. */
#define xmap_dereference(PTR) \
	rcu_dereference_protected((PTR), lockdep_is_held(&xps_map_mutex))
1903
1904 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1905                                         int cpu, u16 index)
1906 {
1907         struct xps_map *map = NULL;
1908         int pos;
1909
1910         if (dev_maps)
1911                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1912
1913         for (pos = 0; map && pos < map->len; pos++) {
1914                 if (map->queues[pos] == index) {
1915                         if (map->len > 1) {
1916                                 map->queues[pos] = map->queues[--map->len];
1917                         } else {
1918                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1919                                 kfree_rcu(map, rcu);
1920                                 map = NULL;
1921                         }
1922                         break;
1923                 }
1924         }
1925
1926         return map;
1927 }
1928
1929 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1930 {
1931         struct xps_dev_maps *dev_maps;
1932         int cpu, i;
1933         bool active = false;
1934
1935         mutex_lock(&xps_map_mutex);
1936         dev_maps = xmap_dereference(dev->xps_maps);
1937
1938         if (!dev_maps)
1939                 goto out_no_maps;
1940
1941         for_each_possible_cpu(cpu) {
1942                 for (i = index; i < dev->num_tx_queues; i++) {
1943                         if (!remove_xps_queue(dev_maps, cpu, i))
1944                                 break;
1945                 }
1946                 if (i == dev->num_tx_queues)
1947                         active = true;
1948         }
1949
1950         if (!active) {
1951                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1952                 kfree_rcu(dev_maps, rcu);
1953         }
1954
1955         for (i = index; i < dev->num_tx_queues; i++)
1956                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1957                                              NUMA_NO_NODE);
1958
1959 out_no_maps:
1960         mutex_unlock(&xps_map_mutex);
1961 }
1962
1963 static struct xps_map *expand_xps_map(struct xps_map *map,
1964                                       int cpu, u16 index)
1965 {
1966         struct xps_map *new_map;
1967         int alloc_len = XPS_MIN_MAP_ALLOC;
1968         int i, pos;
1969
1970         for (pos = 0; map && pos < map->len; pos++) {
1971                 if (map->queues[pos] != index)
1972                         continue;
1973                 return map;
1974         }
1975
1976         /* Need to add queue to this CPU's existing map */
1977         if (map) {
1978                 if (pos < map->alloc_len)
1979                         return map;
1980
1981                 alloc_len = map->alloc_len * 2;
1982         }
1983
1984         /* Need to allocate new map to store queue on this CPU's map */
1985         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1986                                cpu_to_node(cpu));
1987         if (!new_map)
1988                 return NULL;
1989
1990         for (i = 0; i < pos; i++)
1991                 new_map->queues[i] = map->queues[i];
1992         new_map->alloc_len = alloc_len;
1993         new_map->len = pos;
1994
1995         return new_map;
1996 }
1997
1998 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1999                         u16 index)
2000 {
2001         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2002         struct xps_map *map, *new_map;
2003         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2004         int cpu, numa_node_id = -2;
2005         bool active = false;
2006
2007         mutex_lock(&xps_map_mutex);
2008
2009         dev_maps = xmap_dereference(dev->xps_maps);
2010
2011         /* allocate memory for queue storage */
2012         for_each_online_cpu(cpu) {
2013                 if (!cpumask_test_cpu(cpu, mask))
2014                         continue;
2015
2016                 if (!new_dev_maps)
2017                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2018                 if (!new_dev_maps) {
2019                         mutex_unlock(&xps_map_mutex);
2020                         return -ENOMEM;
2021                 }
2022
2023                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2024                                  NULL;
2025
2026                 map = expand_xps_map(map, cpu, index);
2027                 if (!map)
2028                         goto error;
2029
2030                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2031         }
2032
2033         if (!new_dev_maps)
2034                 goto out_no_new_maps;
2035
2036         for_each_possible_cpu(cpu) {
2037                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2038                         /* add queue to CPU maps */
2039                         int pos = 0;
2040
2041                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2042                         while ((pos < map->len) && (map->queues[pos] != index))
2043                                 pos++;
2044
2045                         if (pos == map->len)
2046                                 map->queues[map->len++] = index;
2047 #ifdef CONFIG_NUMA
2048                         if (numa_node_id == -2)
2049                                 numa_node_id = cpu_to_node(cpu);
2050                         else if (numa_node_id != cpu_to_node(cpu))
2051                                 numa_node_id = -1;
2052 #endif
2053                 } else if (dev_maps) {
2054                         /* fill in the new device map from the old device map */
2055                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2056                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2057                 }
2058
2059         }
2060
2061         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2062
2063         /* Cleanup old maps */
2064         if (dev_maps) {
2065                 for_each_possible_cpu(cpu) {
2066                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2067                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2068                         if (map && map != new_map)
2069                                 kfree_rcu(map, rcu);
2070                 }
2071
2072                 kfree_rcu(dev_maps, rcu);
2073         }
2074
2075         dev_maps = new_dev_maps;
2076         active = true;
2077
2078 out_no_new_maps:
2079         /* update Tx queue numa node */
2080         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2081                                      (numa_node_id >= 0) ? numa_node_id :
2082                                      NUMA_NO_NODE);
2083
2084         if (!dev_maps)
2085                 goto out_no_maps;
2086
2087         /* removes queue from unused CPUs */
2088         for_each_possible_cpu(cpu) {
2089                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2090                         continue;
2091
2092                 if (remove_xps_queue(dev_maps, cpu, index))
2093                         active = true;
2094         }
2095
2096         /* free map if not active */
2097         if (!active) {
2098                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2099                 kfree_rcu(dev_maps, rcu);
2100         }
2101
2102 out_no_maps:
2103         mutex_unlock(&xps_map_mutex);
2104
2105         return 0;
2106 error:
2107         /* remove any maps that we added */
2108         for_each_possible_cpu(cpu) {
2109                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2110                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2111                                  NULL;
2112                 if (new_map && new_map != map)
2113                         kfree(new_map);
2114         }
2115
2116         mutex_unlock(&xps_map_mutex);
2117
2118         kfree(new_dev_maps);
2119         return -ENOMEM;
2120 }
2121 EXPORT_SYMBOL(netif_set_xps_queue);
2122
2123 #endif
2124 /*
2125  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2126  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2127  */
2128 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2129 {
2130         int rc;
2131
2132         if (txq < 1 || txq > dev->num_tx_queues)
2133                 return -EINVAL;
2134
2135         if (dev->reg_state == NETREG_REGISTERED ||
2136             dev->reg_state == NETREG_UNREGISTERING) {
2137                 ASSERT_RTNL();
2138
2139                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2140                                                   txq);
2141                 if (rc)
2142                         return rc;
2143
2144                 if (dev->num_tc)
2145                         netif_setup_tc(dev, txq);
2146
2147                 if (txq < dev->real_num_tx_queues) {
2148                         qdisc_reset_all_tx_gt(dev, txq);
2149 #ifdef CONFIG_XPS
2150                         netif_reset_xps_queues_gt(dev, txq);
2151 #endif
2152                 }
2153         }
2154
2155         dev->real_num_tx_queues = txq;
2156         return 0;
2157 }
2158 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2159
2160 #ifdef CONFIG_SYSFS
2161 /**
2162  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2163  *      @dev: Network device
2164  *      @rxq: Actual number of RX queues
2165  *
2166  *      This must be called either with the rtnl_lock held or before
2167  *      registration of the net device.  Returns 0 on success, or a
2168  *      negative error code.  If called before registration, it always
2169  *      succeeds.
2170  */
2171 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2172 {
2173         int rc;
2174
2175         if (rxq < 1 || rxq > dev->num_rx_queues)
2176                 return -EINVAL;
2177
2178         if (dev->reg_state == NETREG_REGISTERED) {
2179                 ASSERT_RTNL();
2180
2181                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2182                                                   rxq);
2183                 if (rc)
2184                         return rc;
2185         }
2186
2187         dev->real_num_rx_queues = rxq;
2188         return 0;
2189 }
2190 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2191 #endif
2192
2193 /**
2194  * netif_get_num_default_rss_queues - default number of RSS queues
2195  *
2196  * This routine should set an upper limit on the number of RSS queues
2197  * used by default by multiqueue devices.
2198  */
2199 int netif_get_num_default_rss_queues(void)
2200 {
2201         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2202 }
2203 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2204
2205 static inline void __netif_reschedule(struct Qdisc *q)
2206 {
2207         struct softnet_data *sd;
2208         unsigned long flags;
2209
2210         local_irq_save(flags);
2211         sd = this_cpu_ptr(&softnet_data);
2212         q->next_sched = NULL;
2213         *sd->output_queue_tailp = q;
2214         sd->output_queue_tailp = &q->next_sched;
2215         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2216         local_irq_restore(flags);
2217 }
2218
2219 void __netif_schedule(struct Qdisc *q)
2220 {
2221         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2222                 __netif_reschedule(q);
2223 }
2224 EXPORT_SYMBOL(__netif_schedule);
2225
2226 struct dev_kfree_skb_cb {
2227         enum skb_free_reason reason;
2228 };
2229
2230 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2231 {
2232         return (struct dev_kfree_skb_cb *)skb->cb;
2233 }
2234
2235 void netif_schedule_queue(struct netdev_queue *txq)
2236 {
2237         rcu_read_lock();
2238         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2239                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2240
2241                 __netif_schedule(q);
2242         }
2243         rcu_read_unlock();
2244 }
2245 EXPORT_SYMBOL(netif_schedule_queue);
2246
2247 /**
2248  *      netif_wake_subqueue - allow sending packets on subqueue
2249  *      @dev: network device
2250  *      @queue_index: sub queue index
2251  *
2252  * Resume individual transmit queue of a device with multiple transmit queues.
2253  */
2254 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2255 {
2256         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2257
2258         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2259                 struct Qdisc *q;
2260
2261                 rcu_read_lock();
2262                 q = rcu_dereference(txq->qdisc);
2263                 __netif_schedule(q);
2264                 rcu_read_unlock();
2265         }
2266 }
2267 EXPORT_SYMBOL(netif_wake_subqueue);
2268
2269 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2270 {
2271         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2272                 struct Qdisc *q;
2273
2274                 rcu_read_lock();
2275                 q = rcu_dereference(dev_queue->qdisc);
2276                 __netif_schedule(q);
2277                 rcu_read_unlock();
2278         }
2279 }
2280 EXPORT_SYMBOL(netif_tx_wake_queue);
2281
2282 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2283 {
2284         unsigned long flags;
2285
2286         if (likely(atomic_read(&skb->users) == 1)) {
2287                 smp_rmb();
2288                 atomic_set(&skb->users, 0);
2289         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2290                 return;
2291         }
2292         get_kfree_skb_cb(skb)->reason = reason;
2293         local_irq_save(flags);
2294         skb->next = __this_cpu_read(softnet_data.completion_queue);
2295         __this_cpu_write(softnet_data.completion_queue, skb);
2296         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2297         local_irq_restore(flags);
2298 }
2299 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2300
2301 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2302 {
2303         if (in_irq() || irqs_disabled())
2304                 __dev_kfree_skb_irq(skb, reason);
2305         else
2306                 dev_kfree_skb(skb);
2307 }
2308 EXPORT_SYMBOL(__dev_kfree_skb_any);
2309
2310
2311 /**
2312  * netif_device_detach - mark device as removed
2313  * @dev: network device
2314  *
2315  * Mark device as removed from system and therefore no longer available.
2316  */
2317 void netif_device_detach(struct net_device *dev)
2318 {
2319         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2320             netif_running(dev)) {
2321                 netif_tx_stop_all_queues(dev);
2322         }
2323 }
2324 EXPORT_SYMBOL(netif_device_detach);
2325
2326 /**
2327  * netif_device_attach - mark device as attached
2328  * @dev: network device
2329  *
2330  * Mark device as attached from system and restart if needed.
2331  */
2332 void netif_device_attach(struct net_device *dev)
2333 {
2334         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2335             netif_running(dev)) {
2336                 netif_tx_wake_all_queues(dev);
2337                 __netdev_watchdog_up(dev);
2338         }
2339 }
2340 EXPORT_SYMBOL(netif_device_attach);
2341
2342 static void skb_warn_bad_offload(const struct sk_buff *skb)
2343 {
2344         static const netdev_features_t null_features = 0;
2345         struct net_device *dev = skb->dev;
2346         const char *driver = "";
2347
2348         if (!net_ratelimit())
2349                 return;
2350
2351         if (dev && dev->dev.parent)
2352                 driver = dev_driver_string(dev->dev.parent);
2353
2354         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2355              "gso_type=%d ip_summed=%d\n",
2356              driver, dev ? &dev->features : &null_features,
2357              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2358              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2359              skb_shinfo(skb)->gso_type, skb->ip_summed);
2360 }
2361
2362 /*
2363  * Invalidate hardware checksum when packet is to be mangled, and
2364  * complete checksum manually on outgoing path.
2365  */
2366 int skb_checksum_help(struct sk_buff *skb)
2367 {
2368         __wsum csum;
2369         int ret = 0, offset;
2370
2371         if (skb->ip_summed == CHECKSUM_COMPLETE)
2372                 goto out_set_summed;
2373
2374         if (unlikely(skb_shinfo(skb)->gso_size)) {
2375                 skb_warn_bad_offload(skb);
2376                 return -EINVAL;
2377         }
2378
2379         /* Before computing a checksum, we should make sure no frag could
2380          * be modified by an external entity : checksum could be wrong.
2381          */
2382         if (skb_has_shared_frag(skb)) {
2383                 ret = __skb_linearize(skb);
2384                 if (ret)
2385                         goto out;
2386         }
2387
2388         offset = skb_checksum_start_offset(skb);
2389         BUG_ON(offset >= skb_headlen(skb));
2390         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2391
2392         offset += skb->csum_offset;
2393         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2394
2395         if (skb_cloned(skb) &&
2396             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2397                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2398                 if (ret)
2399                         goto out;
2400         }
2401
2402         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2403 out_set_summed:
2404         skb->ip_summed = CHECKSUM_NONE;
2405 out:
2406         return ret;
2407 }
2408 EXPORT_SYMBOL(skb_checksum_help);
2409
2410 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2411 {
2412         __be16 type = skb->protocol;
2413
2414         /* Tunnel gso handlers can set protocol to ethernet. */
2415         if (type == htons(ETH_P_TEB)) {
2416                 struct ethhdr *eth;
2417
2418                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2419                         return 0;
2420
2421                 eth = (struct ethhdr *)skb_mac_header(skb);
2422                 type = eth->h_proto;
2423         }
2424
2425         return __vlan_get_protocol(skb, type, depth);
2426 }
2427
2428 /**
2429  *      skb_mac_gso_segment - mac layer segmentation handler.
2430  *      @skb: buffer to segment
2431  *      @features: features for the output path (see dev->features)
2432  */
2433 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2434                                     netdev_features_t features)
2435 {
2436         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2437         struct packet_offload *ptype;
2438         int vlan_depth = skb->mac_len;
2439         __be16 type = skb_network_protocol(skb, &vlan_depth);
2440
2441         if (unlikely(!type))
2442                 return ERR_PTR(-EINVAL);
2443
2444         __skb_pull(skb, vlan_depth);
2445
2446         rcu_read_lock();
2447         list_for_each_entry_rcu(ptype, &offload_base, list) {
2448                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2449                         segs = ptype->callbacks.gso_segment(skb, features);
2450                         break;
2451                 }
2452         }
2453         rcu_read_unlock();
2454
2455         __skb_push(skb, skb->data - skb_mac_header(skb));
2456
2457         return segs;
2458 }
2459 EXPORT_SYMBOL(skb_mac_gso_segment);
2460
2461
2462 /* openvswitch calls this on rx path, so we need a different check.
2463  */
2464 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2465 {
2466         if (tx_path)
2467                 return skb->ip_summed != CHECKSUM_PARTIAL;
2468         else
2469                 return skb->ip_summed == CHECKSUM_NONE;
2470 }
2471
2472 /**
2473  *      __skb_gso_segment - Perform segmentation on skb.
2474  *      @skb: buffer to segment
2475  *      @features: features for the output path (see dev->features)
2476  *      @tx_path: whether it is called in TX path
2477  *
2478  *      This function segments the given skb and returns a list of segments.
2479  *
2480  *      It may return NULL if the skb requires no segmentation.  This is
2481  *      only possible when GSO is used for verifying header integrity.
2482  *
2483  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2484  */
2485 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2486                                   netdev_features_t features, bool tx_path)
2487 {
2488         if (unlikely(skb_needs_check(skb, tx_path))) {
2489                 int err;
2490
2491                 skb_warn_bad_offload(skb);
2492
2493                 err = skb_cow_head(skb, 0);
2494                 if (err < 0)
2495                         return ERR_PTR(err);
2496         }
2497
2498         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2499                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2500
2501         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2502         SKB_GSO_CB(skb)->encap_level = 0;
2503
2504         skb_reset_mac_header(skb);
2505         skb_reset_mac_len(skb);
2506
2507         return skb_mac_gso_segment(skb, features);
2508 }
2509 EXPORT_SYMBOL(__skb_gso_segment);
2510
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2522
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a frag of @skb lives in a page the device must not DMA
 * from, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int f;

	/* highmem pages are only acceptable with NETIF_F_HIGHDMA */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[f];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	/* every frag page must end at or below the parent's DMA mask */
	for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[f];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2555
2556 /* If MPLS offload request, verify we are testing hardware MPLS features
2557  * instead of standard features for the netdev.
2558  */
2559 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2560 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2561                                            netdev_features_t features,
2562                                            __be16 type)
2563 {
2564         if (eth_p_mpls(type))
2565                 features &= skb->dev->mpls_features;
2566
2567         return features;
2568 }
2569 #else
2570 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2571                                            netdev_features_t features,
2572                                            __be16 type)
2573 {
2574         return features;
2575 }
2576 #endif
2577
2578 static netdev_features_t harmonize_features(struct sk_buff *skb,
2579         netdev_features_t features)
2580 {
2581         int tmp;
2582         __be16 type;
2583
2584         type = skb_network_protocol(skb, &tmp);
2585         features = net_mpls_features(skb, features, type);
2586
2587         if (skb->ip_summed != CHECKSUM_NONE &&
2588             !can_checksum_protocol(features, type)) {
2589                 features &= ~NETIF_F_ALL_CSUM;
2590         } else if (illegal_highdma(skb->dev, skb)) {
2591                 features &= ~NETIF_F_SG;
2592         }
2593
2594         return features;
2595 }
2596
2597 netdev_features_t passthru_features_check(struct sk_buff *skb,
2598                                           struct net_device *dev,
2599                                           netdev_features_t features)
2600 {
2601         return features;
2602 }
2603 EXPORT_SYMBOL(passthru_features_check);
2604
2605 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2606                                              struct net_device *dev,
2607                                              netdev_features_t features)
2608 {
2609         return vlan_features_check(skb, features);
2610 }
2611
2612 netdev_features_t netif_skb_features(struct sk_buff *skb)
2613 {
2614         struct net_device *dev = skb->dev;
2615         netdev_features_t features = dev->features;
2616         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2617
2618         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2619                 features &= ~NETIF_F_GSO_MASK;
2620
2621         /* If encapsulation offload request, verify we are testing
2622          * hardware encapsulation features instead of standard
2623          * features for the netdev
2624          */
2625         if (skb->encapsulation)
2626                 features &= dev->hw_enc_features;
2627
2628         if (skb_vlan_tagged(skb))
2629                 features = netdev_intersect_features(features,
2630                                                      dev->vlan_features |
2631                                                      NETIF_F_HW_VLAN_CTAG_TX |
2632                                                      NETIF_F_HW_VLAN_STAG_TX);
2633
2634         if (dev->netdev_ops->ndo_features_check)
2635                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2636                                                                 features);
2637         else
2638                 features &= dflt_features_check(skb, dev, features);
2639
2640         return harmonize_features(skb, features);
2641 }
2642 EXPORT_SYMBOL(netif_skb_features);
2643
2644 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2645                     struct netdev_queue *txq, bool more)
2646 {
2647         unsigned int len;
2648         int rc;
2649
2650         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2651                 dev_queue_xmit_nit(skb, dev);
2652
2653         len = skb->len;
2654         trace_net_dev_start_xmit(skb, dev);
2655         rc = netdev_start_xmit(skb, dev, txq, more);
2656         trace_net_dev_xmit(skb, rc, dev, len);
2657
2658         return rc;
2659 }
2660
2661 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2662                                     struct netdev_queue *txq, int *ret)
2663 {
2664         struct sk_buff *skb = first;
2665         int rc = NETDEV_TX_OK;
2666
2667         while (skb) {
2668                 struct sk_buff *next = skb->next;
2669
2670                 skb->next = NULL;
2671                 rc = xmit_one(skb, dev, txq, next != NULL);
2672                 if (unlikely(!dev_xmit_complete(rc))) {
2673                         skb->next = next;
2674                         goto out;
2675                 }
2676
2677                 skb = next;
2678                 if (netif_xmit_stopped(txq) && skb) {
2679                         rc = NETDEV_TX_BUSY;
2680                         break;
2681                 }
2682         }
2683
2684 out:
2685         *ret = rc;
2686         return skb;
2687 }
2688
2689 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2690                                           netdev_features_t features)
2691 {
2692         if (skb_vlan_tag_present(skb) &&
2693             !vlan_hw_offload_capable(features, skb->vlan_proto))
2694                 skb = __vlan_hwaccel_push_inside(skb);
2695         return skb;
2696 }
2697
2698 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2699 {
2700         netdev_features_t features;
2701
2702         if (skb->next)
2703                 return skb;
2704
2705         features = netif_skb_features(skb);
2706         skb = validate_xmit_vlan(skb, features);
2707         if (unlikely(!skb))
2708                 goto out_null;
2709
2710         if (netif_needs_gso(skb, features)) {
2711                 struct sk_buff *segs;
2712
2713                 segs = skb_gso_segment(skb, features);
2714                 if (IS_ERR(segs)) {
2715                         goto out_kfree_skb;
2716                 } else if (segs) {
2717                         consume_skb(skb);
2718                         skb = segs;
2719                 }
2720         } else {
2721                 if (skb_needs_linearize(skb, features) &&
2722                     __skb_linearize(skb))
2723                         goto out_kfree_skb;
2724
2725                 /* If packet is not checksummed and device does not
2726                  * support checksumming for this protocol, complete
2727                  * checksumming here.
2728                  */
2729                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2730                         if (skb->encapsulation)
2731                                 skb_set_inner_transport_header(skb,
2732                                                                skb_checksum_start_offset(skb));
2733                         else
2734                                 skb_set_transport_header(skb,
2735                                                          skb_checksum_start_offset(skb));
2736                         if (!(features & NETIF_F_ALL_CSUM) &&
2737                             skb_checksum_help(skb))
2738                                 goto out_kfree_skb;
2739                 }
2740         }
2741
2742         return skb;
2743
2744 out_kfree_skb:
2745         kfree_skb(skb);
2746 out_null:
2747         return NULL;
2748 }
2749
2750 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2751 {
2752         struct sk_buff *next, *head = NULL, *tail;
2753
2754         for (; skb != NULL; skb = next) {
2755                 next = skb->next;
2756                 skb->next = NULL;
2757
2758                 /* in case skb wont be segmented, point to itself */
2759                 skb->prev = skb;
2760
2761                 skb = validate_xmit_skb(skb, dev);
2762                 if (!skb)
2763                         continue;
2764
2765                 if (!head)
2766                         head = skb;
2767                 else
2768                         tail->next = skb;
2769                 /* If skb was segmented, skb->prev points to
2770                  * the last segment. If not, it still contains skb.
2771                  */
2772                 tail = skb->prev;
2773         }
2774         return head;
2775 }
2776
2777 static void qdisc_pkt_len_init(struct sk_buff *skb)
2778 {
2779         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2780
2781         qdisc_skb_cb(skb)->pkt_len = skb->len;
2782
2783         /* To get more precise estimation of bytes sent on wire,
2784          * we add to pkt_len the headers size of all segments
2785          */
2786         if (shinfo->gso_size)  {
2787                 unsigned int hdr_len;
2788                 u16 gso_segs = shinfo->gso_segs;
2789
2790                 /* mac layer + network layer */
2791                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2792
2793                 /* + transport layer */
2794                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2795                         hdr_len += tcp_hdrlen(skb);
2796                 else
2797                         hdr_len += sizeof(struct udphdr);
2798
2799                 if (shinfo->gso_type & SKB_GSO_DODGY)
2800                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2801                                                 shinfo->gso_size);
2802
2803                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2804         }
2805 }
2806
2807 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2808                                  struct net_device *dev,
2809                                  struct netdev_queue *txq)
2810 {
2811         spinlock_t *root_lock = qdisc_lock(q);
2812         bool contended;
2813         int rc;
2814
2815         qdisc_pkt_len_init(skb);
2816         qdisc_calculate_pkt_len(skb, q);
2817         /*
2818          * Heuristic to force contended enqueues to serialize on a
2819          * separate lock before trying to get qdisc main lock.
2820          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2821          * often and dequeue packets faster.
2822          */
2823         contended = qdisc_is_running(q);
2824         if (unlikely(contended))
2825                 spin_lock(&q->busylock);
2826
2827         spin_lock(root_lock);
2828         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2829                 kfree_skb(skb);
2830                 rc = NET_XMIT_DROP;
2831         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2832                    qdisc_run_begin(q)) {
2833                 /*
2834                  * This is a work-conserving queue; there are no old skbs
2835                  * waiting to be sent out; and the qdisc is not running -
2836                  * xmit the skb directly.
2837                  */
2838
2839                 qdisc_bstats_update(q, skb);
2840
2841                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2842                         if (unlikely(contended)) {
2843                                 spin_unlock(&q->busylock);
2844                                 contended = false;
2845                         }
2846                         __qdisc_run(q);
2847                 } else
2848                         qdisc_run_end(q);
2849
2850                 rc = NET_XMIT_SUCCESS;
2851         } else {
2852                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2853                 if (qdisc_run_begin(q)) {
2854                         if (unlikely(contended)) {
2855                                 spin_unlock(&q->busylock);
2856                                 contended = false;
2857                         }
2858                         __qdisc_run(q);
2859                 }
2860         }
2861         spin_unlock(root_lock);
2862         if (unlikely(contended))
2863                 spin_unlock(&q->busylock);
2864         return rc;
2865 }
2866
2867 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2868 static void skb_update_prio(struct sk_buff *skb)
2869 {
2870         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2871
2872         if (!skb->priority && skb->sk && map) {
2873                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2874
2875                 if (prioidx < map->priomap_len)
2876                         skb->priority = map->priomap[prioidx];
2877         }
2878 }
2879 #else
2880 #define skb_update_prio(skb)
2881 #endif
2882
2883 DEFINE_PER_CPU(int, xmit_recursion);
2884 EXPORT_SYMBOL(xmit_recursion);
2885
2886 #define RECURSION_LIMIT 10
2887
2888 /**
2889  *      dev_loopback_xmit - loop back @skb
2890  *      @skb: buffer to transmit
2891  */
2892 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2893 {
2894         skb_reset_mac_header(skb);
2895         __skb_pull(skb, skb_network_offset(skb));
2896         skb->pkt_type = PACKET_LOOPBACK;
2897         skb->ip_summed = CHECKSUM_UNNECESSARY;
2898         WARN_ON(!skb_dst(skb));
2899         skb_dst_force(skb);
2900         netif_rx_ni(skb);
2901         return 0;
2902 }
2903 EXPORT_SYMBOL(dev_loopback_xmit);
2904
2905 /**
2906  *      __dev_queue_xmit - transmit a buffer
2907  *      @skb: buffer to transmit
2908  *      @accel_priv: private data used for L2 forwarding offload
2909  *
2910  *      Queue a buffer for transmission to a network device. The caller must
2911  *      have set the device and priority and built the buffer before calling
2912  *      this function. The function can be called from an interrupt.
2913  *
2914  *      A negative errno code is returned on a failure. A success does not
2915  *      guarantee the frame will be transmitted as it may be dropped due
2916  *      to congestion or traffic shaping.
2917  *
2918  * -----------------------------------------------------------------------------------
2919  *      I notice this method can also return errors from the queue disciplines,
2920  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2921  *      be positive.
2922  *
2923  *      Regardless of the return value, the skb is consumed, so it is currently
2924  *      difficult to retry a send to this method.  (You can bump the ref count
2925  *      before sending to hold a reference for retry if you are careful.)
2926  *
2927  *      When calling this method, interrupts MUST be enabled.  This is because
2928  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2929  *          --BLG
2930  */
2931 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2932 {
2933         struct net_device *dev = skb->dev;
2934         struct netdev_queue *txq;
2935         struct Qdisc *q;
2936         int rc = -ENOMEM;
2937
2938         skb_reset_mac_header(skb);
2939
2940         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2941                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2942
2943         /* Disable soft irqs for various locks below. Also
2944          * stops preemption for RCU.
2945          */
2946         rcu_read_lock_bh();
2947
2948         skb_update_prio(skb);
2949
2950         /* If device/qdisc don't need skb->dst, release it right now while
2951          * its hot in this cpu cache.
2952          */
2953         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2954                 skb_dst_drop(skb);
2955         else
2956                 skb_dst_force(skb);
2957
2958         txq = netdev_pick_tx(dev, skb, accel_priv);
2959         q = rcu_dereference_bh(txq->qdisc);
2960
2961 #ifdef CONFIG_NET_CLS_ACT
2962         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2963 #endif
2964         trace_net_dev_queue(skb);
2965         if (q->enqueue) {
2966                 rc = __dev_xmit_skb(skb, q, dev, txq);
2967                 goto out;
2968         }
2969
2970         /* The device has no queue. Common case for software devices:
2971            loopback, all the sorts of tunnels...
2972
2973            Really, it is unlikely that netif_tx_lock protection is necessary
2974            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2975            counters.)
2976            However, it is possible, that they rely on protection
2977            made by us here.
2978
2979            Check this and shot the lock. It is not prone from deadlocks.
2980            Either shot noqueue qdisc, it is even simpler 8)
2981          */
2982         if (dev->flags & IFF_UP) {
2983                 int cpu = smp_processor_id(); /* ok because BHs are off */
2984
2985                 if (txq->xmit_lock_owner != cpu) {
2986
2987                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2988                                 goto recursion_alert;
2989
2990                         skb = validate_xmit_skb(skb, dev);
2991                         if (!skb)
2992                                 goto drop;
2993
2994                         HARD_TX_LOCK(dev, txq, cpu);
2995
2996                         if (!netif_xmit_stopped(txq)) {
2997                                 __this_cpu_inc(xmit_recursion);
2998                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2999                                 __this_cpu_dec(xmit_recursion);
3000                                 if (dev_xmit_complete(rc)) {
3001                                         HARD_TX_UNLOCK(dev, txq);
3002                                         goto out;
3003                                 }
3004                         }
3005                         HARD_TX_UNLOCK(dev, txq);
3006                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3007                                              dev->name);
3008                 } else {
3009                         /* Recursion is detected! It is possible,
3010                          * unfortunately
3011                          */
3012 recursion_alert:
3013                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3014                                              dev->name);
3015                 }
3016         }
3017
3018         rc = -ENETDOWN;
3019 drop:
3020         rcu_read_unlock_bh();
3021
3022         atomic_long_inc(&dev->tx_dropped);
3023         kfree_skb_list(skb);
3024         return rc;
3025 out:
3026         rcu_read_unlock_bh();
3027         return rc;
3028 }
3029
3030 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3031 {
3032         return __dev_queue_xmit(skb, NULL);
3033 }
3034 EXPORT_SYMBOL(dev_queue_xmit_sk);
3035
/**
 *	dev_queue_xmit_accel - queue a buffer for transmission with offload data
 *	@skb: buffer to transmit
 *	@accel_priv: private data handed through to __dev_queue_xmit()
 *
 *	Returns the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3041
3042
3043 /*=======================================================================
3044                         Receiver routines
3045   =======================================================================*/
3046
3047 int netdev_max_backlog __read_mostly = 1000;
3048 EXPORT_SYMBOL(netdev_max_backlog);
3049
3050 int netdev_tstamp_prequeue __read_mostly = 1;
3051 int netdev_budget __read_mostly = 300;
3052 int weight_p __read_mostly = 64;            /* old backlog weight */
3053
3054 /* Called with irq disabled */
3055 static inline void ____napi_schedule(struct softnet_data *sd,
3056                                      struct napi_struct *napi)
3057 {
3058         list_add_tail(&napi->poll_list, &sd->poll_list);
3059         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3060 }
3061
3062 #ifdef CONFIG_RPS
3063
3064 /* One global table that all flow-based protocols share. */
3065 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3066 EXPORT_SYMBOL(rps_sock_flow_table);
3067 u32 rps_cpu_mask __read_mostly;
3068 EXPORT_SYMBOL(rps_cpu_mask);
3069
3070 struct static_key rps_needed __read_mostly;
3071
3072 static struct rps_dev_flow *
3073 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3074             struct rps_dev_flow *rflow, u16 next_cpu)
3075 {
3076         if (next_cpu < nr_cpu_ids) {
3077 #ifdef CONFIG_RFS_ACCEL
3078                 struct netdev_rx_queue *rxqueue;
3079                 struct rps_dev_flow_table *flow_table;
3080                 struct rps_dev_flow *old_rflow;
3081                 u32 flow_id;
3082                 u16 rxq_index;
3083                 int rc;
3084
3085                 /* Should we steer this flow to a different hardware queue? */
3086                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3087                     !(dev->features & NETIF_F_NTUPLE))
3088                         goto out;
3089                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3090                 if (rxq_index == skb_get_rx_queue(skb))
3091                         goto out;
3092
3093                 rxqueue = dev->_rx + rxq_index;
3094                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3095                 if (!flow_table)
3096                         goto out;
3097                 flow_id = skb_get_hash(skb) & flow_table->mask;
3098                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3099                                                         rxq_index, flow_id);
3100                 if (rc < 0)
3101                         goto out;
3102                 old_rflow = rflow;
3103                 rflow = &flow_table->flows[flow_id];
3104                 rflow->filter = rc;
3105                 if (old_rflow->filter == rflow->filter)
3106                         old_rflow->filter = RPS_NO_FILTER;
3107         out:
3108 #endif
3109                 rflow->last_qtail =
3110                         per_cpu(softnet_data, next_cpu).input_queue_head;
3111         }
3112
3113         rflow->cpu = next_cpu;
3114         return rflow;
3115 }
3116
3117 /*
3118  * get_rps_cpu is called from netif_receive_skb and returns the target
3119  * CPU from the RPS map of the receiving queue for a given skb.
3120  * rcu_read_lock must be held on entry.
3121  */
3122 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3123                        struct rps_dev_flow **rflowp)
3124 {
3125         const struct rps_sock_flow_table *sock_flow_table;
3126         struct netdev_rx_queue *rxqueue = dev->_rx;
3127         struct rps_dev_flow_table *flow_table;
3128         struct rps_map *map;
3129         int cpu = -1;
3130         u32 tcpu;
3131         u32 hash;
3132
3133         if (skb_rx_queue_recorded(skb)) {
3134                 u16 index = skb_get_rx_queue(skb);
3135
3136                 if (unlikely(index >= dev->real_num_rx_queues)) {
3137                         WARN_ONCE(dev->real_num_rx_queues > 1,
3138                                   "%s received packet on queue %u, but number "
3139                                   "of RX queues is %u\n",
3140                                   dev->name, index, dev->real_num_rx_queues);
3141                         goto done;
3142                 }
3143                 rxqueue += index;
3144         }
3145
3146         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3147
3148         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3149         map = rcu_dereference(rxqueue->rps_map);
3150         if (!flow_table && !map)
3151                 goto done;
3152
3153         skb_reset_network_header(skb);
3154         hash = skb_get_hash(skb);
3155         if (!hash)
3156                 goto done;
3157
3158         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3159         if (flow_table && sock_flow_table) {
3160                 struct rps_dev_flow *rflow;
3161                 u32 next_cpu;
3162                 u32 ident;
3163
3164                 /* First check into global flow table if there is a match */
3165                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3166                 if ((ident ^ hash) & ~rps_cpu_mask)
3167                         goto try_rps;
3168
3169                 next_cpu = ident & rps_cpu_mask;
3170
3171                 /* OK, now we know there is a match,
3172                  * we can look at the local (per receive queue) flow table
3173                  */
3174                 rflow = &flow_table->flows[hash & flow_table->mask];
3175                 tcpu = rflow->cpu;
3176
3177                 /*
3178                  * If the desired CPU (where last recvmsg was done) is
3179                  * different from current CPU (one in the rx-queue flow
3180                  * table entry), switch if one of the following holds:
3181                  *   - Current CPU is unset (>= nr_cpu_ids).
3182                  *   - Current CPU is offline.
3183                  *   - The current CPU's queue tail has advanced beyond the
3184                  *     last packet that was enqueued using this table entry.
3185                  *     This guarantees that all previous packets for the flow
3186                  *     have been dequeued, thus preserving in order delivery.
3187                  */
3188                 if (unlikely(tcpu != next_cpu) &&
3189                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3190                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3191                       rflow->last_qtail)) >= 0)) {
3192                         tcpu = next_cpu;
3193                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3194                 }
3195
3196                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3197                         *rflowp = rflow;
3198                         cpu = tcpu;
3199                         goto done;
3200                 }
3201         }
3202
3203 try_rps:
3204
3205         if (map) {
3206                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3207                 if (cpu_online(tcpu)) {
3208                         cpu = tcpu;
3209                         goto done;
3210                 }
3211         }
3212
3213 done:
3214         return cpu;
3215 }
3216
#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * The filter is kept (%false) only while the flow table entry still
 * refers to @filter_id, its CPU is valid, and that CPU's input queue
 * head has advanced less than 10 * mask packets past the entry's
 * last_qtail.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	bool expire = true;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		struct rps_dev_flow *rflow = &flow_table->flows[flow_id];
		unsigned int cpu = ACCESS_ONCE(rflow->cpu);

		if (rflow->filter == filter_id && cpu < nr_cpu_ids) {
			int delta = (int)(per_cpu(softnet_data, cpu).input_queue_head -
					  rflow->last_qtail);

			if (delta < (int)(10 * flow_table->mask))
				expire = false;
		}
	}
	rcu_read_unlock();

	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */
3256
3257 /* Called from hardirq (IPI) context */
3258 static void rps_trigger_softirq(void *data)
3259 {
3260         struct softnet_data *sd = data;
3261
3262         ____napi_schedule(sd, &sd->backlog);
3263         sd->received_rps++;
3264 }
3265
3266 #endif /* CONFIG_RPS */
3267
/*
 * Check whether this softnet_data belongs to another CPU.
 * If it does, put it on the IPI list of the current CPU, raise
 * NET_RX_SOFTIRQ and return 1.
 * Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd == mysd)
		return 0;

	/* Push sd on the front of our singly linked IPI list. */
	sd->rps_ipi_next = mysd->rps_ipi_list;
	mysd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3288
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12).
 * NOTE(review): presumably the number of buckets used when a flow limit
 * table is allocated; the allocation code is outside this part of the
 * file.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3292
3293 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3294 {
3295 #ifdef CONFIG_NET_FLOW_LIMIT
3296         struct sd_flow_limit *fl;
3297         struct softnet_data *sd;
3298         unsigned int old_flow, new_flow;
3299
3300         if (qlen < (netdev_max_backlog >> 1))
3301                 return false;
3302
3303         sd = this_cpu_ptr(&softnet_data);
3304
3305         rcu_read_lock();
3306         fl = rcu_dereference(sd->flow_limit);
3307         if (fl) {
3308                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3309                 old_flow = fl->history[fl->history_head];
3310                 fl->history[fl->history_head] = new_flow;
3311
3312                 fl->history_head++;
3313                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3314
3315                 if (likely(fl->buckets[old_flow]))
3316                         fl->buckets[old_flow]--;
3317
3318                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3319                         fl->count++;
3320                         rcu_read_unlock();
3321                         return true;
3322                 }
3323         }
3324         rcu_read_unlock();
3325 #endif
3326         return false;
3327 }
3328
3329 /*
3330  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3331  * queue (may be a remote CPU queue).
3332  */
3333 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3334                               unsigned int *qtail)
3335 {
3336         struct softnet_data *sd;
3337         unsigned long flags;
3338         unsigned int qlen;
3339
3340         sd = &per_cpu(softnet_data, cpu);
3341
3342         local_irq_save(flags);
3343
3344         rps_lock(sd);
3345         if (!netif_running(skb->dev))
3346                 goto drop;
3347         qlen = skb_queue_len(&sd->input_pkt_queue);
3348         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3349                 if (qlen) {
3350 enqueue:
3351                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3352                         input_queue_tail_incr_save(sd, qtail);
3353                         rps_unlock(sd);
3354                         local_irq_restore(flags);
3355                         return NET_RX_SUCCESS;
3356                 }
3357
3358                 /* Schedule NAPI for backlog device
3359                  * We can use non atomic operation since we own the queue lock
3360                  */
3361                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3362                         if (!rps_ipi_queued(sd))
3363                                 ____napi_schedule(sd, &sd->backlog);
3364                 }
3365                 goto enqueue;
3366         }
3367
3368 drop:
3369         sd->dropped++;
3370         rps_unlock(sd);
3371
3372         local_irq_restore(flags);
3373
3374         atomic_long_inc(&skb->dev->rx_dropped);
3375         kfree_skb(skb);
3376         return NET_RX_DROP;
3377 }
3378
3379 static int netif_rx_internal(struct sk_buff *skb)
3380 {
3381         int ret;
3382
3383         net_timestamp_check(netdev_tstamp_prequeue, skb);
3384
3385         trace_netif_rx(skb);
3386 #ifdef CONFIG_RPS
3387         if (static_key_false(&rps_needed)) {
3388                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3389                 int cpu;
3390
3391                 preempt_disable();
3392                 rcu_read_lock();
3393
3394                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3395                 if (cpu < 0)
3396                         cpu = smp_processor_id();
3397
3398                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3399
3400                 rcu_read_unlock();
3401                 preempt_enable();
3402         } else
3403 #endif
3404         {
3405                 unsigned int qtail;
3406                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3407                 put_cpu();
3408         }
3409         return ret;
3410 }
3411
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it
 *	for the upper (protocol) levels to process.  It always succeeds.
 *	The buffer may be dropped during processing for congestion control
 *	or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_rx);
3434
/*
 * Variant of netif_rx() that queues @skb with preemption disabled and, if
 * any softirq is pending afterwards, runs do_softirq() before preemption
 * is enabled again.
 *
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	rc = netif_rx_internal(skb);

	/* Process whatever was just raised instead of leaving it pending. */
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3450
3451 static void net_tx_action(struct softirq_action *h)
3452 {
3453         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3454
3455         if (sd->completion_queue) {
3456                 struct sk_buff *clist;
3457
3458                 local_irq_disable();
3459                 clist = sd->completion_queue;
3460                 sd->completion_queue = NULL;
3461                 local_irq_enable();
3462
3463                 while (clist) {
3464                         struct sk_buff *skb = clist;
3465                         clist = clist->next;
3466
3467                         WARN_ON(atomic_read(&skb->users));
3468                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3469                                 trace_consume_skb(skb);
3470                         else
3471                                 trace_kfree_skb(skb, net_tx_action);
3472                         __kfree_skb(skb);
3473                 }
3474         }
3475
3476         if (sd->output_queue) {
3477                 struct Qdisc *head;
3478
3479                 local_irq_disable();
3480                 head = sd->output_queue;
3481                 sd->output_queue = NULL;
3482                 sd->output_queue_tailp = &sd->output_queue;
3483                 local_irq_enable();
3484
3485                 while (head) {
3486                         struct Qdisc *q = head;
3487                         spinlock_t *root_lock;
3488
3489                         head = head->next_sched;
3490
3491                         root_lock = qdisc_lock(q);
3492                         if (spin_trylock(root_lock)) {
3493                                 smp_mb__before_atomic();
3494                                 clear_bit(__QDISC_STATE_SCHED,
3495                                           &q->state);
3496                                 qdisc_run(q);
3497                                 spin_unlock(root_lock);
3498                         } else {
3499                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3500                                               &q->state)) {
3501                                         __netif_reschedule(q);
3502                                 } else {
3503                                         smp_mb__before_atomic();
3504                                         clear_bit(__QDISC_STATE_SCHED,
3505                                                   &q->state);
3506                                 }
3507                         }
3508                 }
3509         }
3510 }
3511
3512 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3513     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3514 /* This hook is defined here for ATM LANE */
3515 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3516                              unsigned char *addr) __read_mostly;
3517 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3518 #endif
3519
3520 #ifdef CONFIG_NET_CLS_ACT
3521 /* TODO: Maybe we should just force sch_ingress to be compiled in
3522  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3523  * a compare and 2 stores extra right now if we dont have it on
3524  * but have CONFIG_NET_CLS_ACT
3525  * NOTE: This doesn't stop any functionality; if you dont have
3526  * the ingress scheduler, you just can't add policies on ingress.
3527  *
3528  */
3529 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3530 {
3531         struct net_device *dev = skb->dev;
3532         u32 ttl = G_TC_RTTL(skb->tc_verd);
3533         int result = TC_ACT_OK;
3534         struct Qdisc *q;
3535
3536         if (unlikely(MAX_RED_LOOP < ttl++)) {
3537                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3538                                      skb->skb_iif, dev->ifindex);
3539                 return TC_ACT_SHOT;
3540         }
3541
3542         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3543         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3544
3545         q = rcu_dereference(rxq->qdisc);
3546         if (q != &noop_qdisc) {
3547                 spin_lock(qdisc_lock(q));
3548                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3549                         result = qdisc_enqueue_root(skb, q);
3550                 spin_unlock(qdisc_lock(q));
3551         }
3552
3553         return result;
3554 }
3555
3556 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3557                                          struct packet_type **pt_prev,
3558                                          int *ret, struct net_device *orig_dev)
3559 {
3560         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3561
3562         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3563                 return skb;
3564
3565         if (*pt_prev) {
3566                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3567                 *pt_prev = NULL;
3568         }
3569
3570         switch (ing_filter(skb, rxq)) {
3571         case TC_ACT_SHOT:
3572         case TC_ACT_STOLEN:
3573                 kfree_skb(skb);
3574                 return NULL;
3575         }
3576
3577         return skb;
3578 }
3579 #endif
3580
3581 /**
3582  *      netdev_rx_handler_register - register receive handler
3583  *      @dev: device to register a handler for
3584  *      @rx_handler: receive handler to register
3585  *      @rx_handler_data: data pointer that is used by rx handler
3586  *
3587  *      Register a receive handler for a device. This handler will then be
3588  *      called from __netif_receive_skb. A negative errno code is returned
3589  *      on a failure.
3590  *
3591  *      The caller must hold the rtnl_mutex.
3592  *
3593  *      For a general description of rx_handler, see enum rx_handler_result.
3594  */
3595 int netdev_rx_handler_register(struct net_device *dev,
3596                                rx_handler_func_t *rx_handler,
3597                                void *rx_handler_data)
3598 {
3599         ASSERT_RTNL();
3600
3601         if (dev->rx_handler)
3602                 return -EBUSY;
3603
3604         /* Note: rx_handler_data must be set before rx_handler */
3605         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3606         rcu_assign_pointer(dev->rx_handler, rx_handler);
3607
3608         return 0;
3609 }
3610 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3611
3612 /**
3613  *      netdev_rx_handler_unregister - unregister receive handler
3614  *      @dev: device to unregister a handler from
3615  *
3616  *      Unregister a receive handler from a device.
3617  *
3618  *      The caller must hold the rtnl_mutex.
3619  */
3620 void netdev_rx_handler_unregister(struct net_device *dev)
3621 {
3622
3623         ASSERT_RTNL();
3624         RCU_INIT_POINTER(dev->rx_handler, NULL);
3625         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3626          * section has a guarantee to see a non NULL rx_handler_data
3627          * as well.
3628          */
3629         synchronize_net();
3630         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3631 }
3632 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3633
3634 /*
3635  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3636  * the special handling of PFMEMALLOC skbs.
3637  */
3638 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3639 {
3640         switch (skb->protocol) {
3641         case htons(ETH_P_ARP):
3642         case htons(ETH_P_IP):
3643         case htons(ETH_P_IPV6):
3644         case htons(ETH_P_8021Q):
3645         case htons(ETH_P_8021AD):
3646                 return true;
3647         default:
3648                 return false;
3649         }
3650 }
3651
3652 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3653 {
3654         struct packet_type *ptype, *pt_prev;
3655         rx_handler_func_t *rx_handler;
3656         struct net_device *orig_dev;
3657         bool deliver_exact = false;
3658         int ret = NET_RX_DROP;
3659         __be16 type;
3660
3661         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3662
3663         trace_netif_receive_skb(skb);
3664
3665         orig_dev = skb->dev;
3666
3667         skb_reset_network_header(skb);
3668         if (!skb_transport_header_was_set(skb))
3669                 skb_reset_transport_header(skb);
3670         skb_reset_mac_len(skb);
3671
3672         pt_prev = NULL;
3673
3674 another_round:
3675         skb->skb_iif = skb->dev->ifindex;
3676
3677         __this_cpu_inc(softnet_data.processed);
3678
3679         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3680             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3681                 skb = skb_vlan_untag(skb);
3682                 if (unlikely(!skb))
3683                         goto out;
3684         }
3685
3686 #ifdef CONFIG_NET_CLS_ACT
3687         if (skb->tc_verd & TC_NCLS) {
3688                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3689                 goto ncls;
3690         }
3691 #endif
3692
3693         if (pfmemalloc)
3694                 goto skip_taps;
3695
3696         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3697                 if (pt_prev)
3698                         ret = deliver_skb(skb, pt_prev, orig_dev);
3699                 pt_prev = ptype;
3700         }
3701
3702         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3703                 if (pt_prev)
3704                         ret = deliver_skb(skb, pt_prev, orig_dev);
3705                 pt_prev = ptype;
3706         }
3707
3708 skip_taps:
3709 #ifdef CONFIG_NET_CLS_ACT
3710         if (static_key_false(&ingress_needed)) {
3711                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3712                 if (!skb)
3713                         goto out;
3714         }
3715
3716         skb->tc_verd = 0;
3717 ncls:
3718 #endif
3719         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3720                 goto drop;
3721
3722         if (skb_vlan_tag_present(skb)) {
3723                 if (pt_prev) {
3724                         ret = deliver_skb(skb, pt_prev, orig_dev);
3725                         pt_prev = NULL;
3726                 }
3727                 if (vlan_do_receive(&skb))
3728                         goto another_round;
3729                 else if (unlikely(!skb))
3730                         goto out;
3731         }
3732
3733         rx_handler = rcu_dereference(skb->dev->rx_handler);
3734         if (rx_handler) {
3735                 if (pt_prev) {
3736                         ret = deliver_skb(skb, pt_prev, orig_dev);
3737                         pt_prev = NULL;
3738                 }
3739                 switch (rx_handler(&skb)) {
3740                 case RX_HANDLER_CONSUMED:
3741                         ret = NET_RX_SUCCESS;
3742                         goto out;
3743                 case RX_HANDLER_ANOTHER:
3744                         goto another_round;
3745                 case RX_HANDLER_EXACT:
3746                         deliver_exact = true;
3747                 case RX_HANDLER_PASS:
3748                         break;
3749                 default:
3750                         BUG();
3751                 }
3752         }
3753
3754         if (unlikely(skb_vlan_tag_present(skb))) {
3755                 if (skb_vlan_tag_get_id(skb))
3756                         skb->pkt_type = PACKET_OTHERHOST;
3757                 /* Note: we might in the future use prio bits
3758                  * and set skb->priority like in vlan_do_receive()
3759                  * For the time being, just ignore Priority Code Point
3760                  */
3761                 skb->vlan_tci = 0;
3762         }
3763
3764         type = skb->protocol;
3765
3766         /* deliver only exact match when indicated */
3767         if (likely(!deliver_exact)) {
3768                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3769                                        &ptype_base[ntohs(type) &
3770                                                    PTYPE_HASH_MASK]);
3771         }
3772
3773         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3774                                &orig_dev->ptype_specific);
3775
3776         if (unlikely(skb->dev != orig_dev)) {
3777                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3778                                        &skb->dev->ptype_specific);
3779         }
3780
3781         if (pt_prev) {
3782                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3783                         goto drop;
3784                 else
3785                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3786         } else {
3787 drop:
3788                 atomic_long_inc(&skb->dev->rx_dropped);
3789                 kfree_skb(skb);
3790                 /* Jamal, now you will not able to escape explaining
3791                  * me how you were going to use this. :-)
3792                  */
3793                 ret = NET_RX_DROP;
3794         }
3795
3796 out:
3797         return ret;
3798 }
3799
3800 static int __netif_receive_skb(struct sk_buff *skb)
3801 {
3802         int ret;
3803
3804         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3805                 unsigned long pflags = current->flags;
3806
3807                 /*
3808                  * PFMEMALLOC skbs are special, they should
3809                  * - be delivered to SOCK_MEMALLOC sockets only
3810                  * - stay away from userspace
3811                  * - have bounded memory usage
3812                  *
3813                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3814                  * context down to all allocation sites.
3815                  */
3816                 current->flags |= PF_MEMALLOC;
3817                 ret = __netif_receive_skb_core(skb, true);
3818                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3819         } else
3820                 ret = __netif_receive_skb_core(skb, false);
3821
3822         return ret;
3823 }
3824
3825 static int netif_receive_skb_internal(struct sk_buff *skb)
3826 {
3827         int ret;
3828
3829         net_timestamp_check(netdev_tstamp_prequeue, skb);
3830
3831         if (skb_defer_rx_timestamp(skb))
3832                 return NET_RX_SUCCESS;
3833
3834         rcu_read_lock();
3835
3836 #ifdef CONFIG_RPS
3837         if (static_key_false(&rps_needed)) {
3838                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3839                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3840
3841                 if (cpu >= 0) {
3842                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3843                         rcu_read_unlock();
3844                         return ret;
3845                 }
3846         }
3847 #endif
3848         ret = __netif_receive_skb(skb);
3849         rcu_read_unlock();
3850         return ret;
3851 }
3852
/**
 *	netif_receive_skb_sk - process receive buffer from network
 *	@sk: socket argument; not used by this function
 *	@skb: buffer to process
 *
 *	Main receive data processing function.  It always succeeds.
 *	The buffer may be dropped during processing for congestion
 *	control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
{
	int status;

	trace_netif_receive_skb_entry(skb);

	status = netif_receive_skb_internal(skb);

	return status;
}
EXPORT_SYMBOL(netif_receive_skb_sk);
3875
3876 /* Network device is going away, flush any packets still pending
3877  * Called with irqs disabled.
3878  */
3879 static void flush_backlog(void *arg)
3880 {
3881         struct net_device *dev = arg;
3882         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3883         struct sk_buff *skb, *tmp;
3884
3885         rps_lock(sd);
3886         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3887                 if (skb->dev == dev) {
3888                         __skb_unlink(skb, &sd->input_pkt_queue);
3889                         kfree_skb(skb);
3890                         input_queue_head_incr(sd);
3891                 }
3892         }
3893         rps_unlock(sd);
3894
3895         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3896                 if (skb->dev == dev) {
3897                         __skb_unlink(skb, &sd->process_queue);
3898                         kfree_skb(skb);
3899                         input_queue_head_incr(sd);
3900                 }
3901         }
3902 }
3903
3904 static int napi_gro_complete(struct sk_buff *skb)
3905 {
3906         struct packet_offload *ptype;
3907         __be16 type = skb->protocol;
3908         struct list_head *head = &offload_base;
3909         int err = -ENOENT;
3910
3911         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3912
3913         if (NAPI_GRO_CB(skb)->count == 1) {
3914                 skb_shinfo(skb)->gso_size = 0;
3915                 goto out;
3916         }
3917
3918         rcu_read_lock();
3919         list_for_each_entry_rcu(ptype, head, list) {
3920                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3921                         continue;
3922
3923                 err = ptype->callbacks.gro_complete(skb, 0);
3924                 break;
3925         }
3926         rcu_read_unlock();
3927
3928         if (err) {
3929                 WARN_ON(&ptype->list == head);
3930                 kfree_skb(skb);
3931                 return NET_RX_SUCCESS;
3932         }
3933
3934 out:
3935         return netif_receive_skb_internal(skb);
3936 }
3937
3938 /* napi->gro_list contains packets ordered by age.
3939  * youngest packets at the head of it.
3940  * Complete skbs in reverse order to reduce latencies.
3941  */
3942 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3943 {
3944         struct sk_buff *skb, *prev = NULL;
3945
3946         /* scan list and build reverse chain */
3947         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3948                 skb->prev = prev;
3949                 prev = skb;
3950         }
3951
3952         for (skb = prev; skb; skb = prev) {
3953                 skb->next = NULL;
3954
3955                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3956                         return;
3957
3958                 prev = skb->prev;
3959                 napi_gro_complete(skb);
3960                 napi->gro_count--;
3961         }
3962
3963         napi->gro_list = NULL;
3964 }
3965 EXPORT_SYMBOL(napi_gro_flush);
3966
3967 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3968 {
3969         struct sk_buff *p;
3970         unsigned int maclen = skb->dev->hard_header_len;
3971         u32 hash = skb_get_hash_raw(skb);
3972
3973         for (p = napi->gro_list; p; p = p->next) {
3974                 unsigned long diffs;
3975
3976                 NAPI_GRO_CB(p)->flush = 0;
3977
3978                 if (hash != skb_get_hash_raw(p)) {
3979                         NAPI_GRO_CB(p)->same_flow = 0;
3980                         continue;
3981                 }
3982
3983                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3984                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3985                 if (maclen == ETH_HLEN)
3986                         diffs |= compare_ether_header(skb_mac_header(p),
3987                                                       skb_mac_header(skb));
3988                 else if (!diffs)
3989                         diffs = memcmp(skb_mac_header(p),
3990                                        skb_mac_header(skb),
3991                                        maclen);
3992                 NAPI_GRO_CB(p)->same_flow = !diffs;
3993         }
3994 }
3995
3996 static void skb_gro_reset_offset(struct sk_buff *skb)
3997 {
3998         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3999         const skb_frag_t *frag0 = &pinfo->frags[0];
4000
4001         NAPI_GRO_CB(skb)->data_offset = 0;
4002         NAPI_GRO_CB(skb)->frag0 = NULL;
4003         NAPI_GRO_CB(skb)->frag0_len = 0;
4004
4005         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4006             pinfo->nr_frags &&
4007             !PageHighMem(skb_frag_page(frag0))) {
4008                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4009                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4010         }
4011 }
4012
4013 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4014 {
4015         struct skb_shared_info *pinfo = skb_shinfo(skb);
4016
4017         BUG_ON(skb->end - skb->tail < grow);
4018
4019         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4020
4021         skb->data_len -= grow;
4022         skb->tail += grow;
4023
4024         pinfo->frags[0].page_offset += grow;
4025         skb_frag_size_sub(&pinfo->frags[0], grow);
4026
4027         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4028                 skb_frag_unref(skb, 0);
4029                 memmove(pinfo->frags, pinfo->frags + 1,
4030                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4031         }
4032 }
4033
4034 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4035 {
4036         struct sk_buff **pp = NULL;
4037         struct packet_offload *ptype;
4038         __be16 type = skb->protocol;
4039         struct list_head *head = &offload_base;
4040         int same_flow;
4041         enum gro_result ret;
4042         int grow;
4043
4044         if (!(skb->dev->features & NETIF_F_GRO))
4045                 goto normal;
4046
4047         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4048                 goto normal;
4049
4050         gro_list_prepare(napi, skb);
4051
4052         rcu_read_lock();
4053         list_for_each_entry_rcu(ptype, head, list) {
4054                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4055                         continue;
4056
4057                 skb_set_network_header(skb, skb_gro_offset(skb));
4058                 skb_reset_mac_len(skb);
4059                 NAPI_GRO_CB(skb)->same_flow = 0;
4060                 NAPI_GRO_CB(skb)->flush = 0;
4061                 NAPI_GRO_CB(skb)->free = 0;
4062                 NAPI_GRO_CB(skb)->recursion_counter = 0;
4063                 NAPI_GRO_CB(skb)->encap_mark = 0;
4064                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4065
4066                 /* Setup for GRO checksum validation */
4067                 switch (skb->ip_summed) {
4068                 case CHECKSUM_COMPLETE:
4069                         NAPI_GRO_CB(skb)->csum = skb->csum;
4070                         NAPI_GRO_CB(skb)->csum_valid = 1;
4071                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4072                         break;
4073                 case CHECKSUM_UNNECESSARY:
4074                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4075                         NAPI_GRO_CB(skb)->csum_valid = 0;
4076                         break;
4077                 default:
4078                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4079                         NAPI_GRO_CB(skb)->csum_valid = 0;
4080                 }
4081
4082                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4083                 break;
4084         }
4085         rcu_read_unlock();
4086
4087         if (&ptype->list == head)
4088                 goto normal;
4089
4090         same_flow = NAPI_GRO_CB(skb)->same_flow;
4091         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4092
4093         if (pp) {
4094                 struct sk_buff *nskb = *pp;
4095
4096                 *pp = nskb->next;
4097                 nskb->next = NULL;
4098                 napi_gro_complete(nskb);
4099                 napi->gro_count--;
4100         }
4101
4102         if (same_flow)
4103                 goto ok;
4104
4105         if (NAPI_GRO_CB(skb)->flush)
4106                 goto normal;
4107
4108         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4109                 struct sk_buff *nskb = napi->gro_list;
4110
4111                 /* locate the end of the list to select the 'oldest' flow */
4112                 while (nskb->next) {
4113                         pp = &nskb->next;
4114                         nskb = *pp;
4115                 }
4116                 *pp = NULL;
4117                 nskb->next = NULL;
4118                 napi_gro_complete(nskb);
4119         } else {
4120                 napi->gro_count++;
4121         }
4122         NAPI_GRO_CB(skb)->count = 1;
4123         NAPI_GRO_CB(skb)->age = jiffies;
4124         NAPI_GRO_CB(skb)->last = skb;
4125         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4126         skb->next = napi->gro_list;
4127         napi->gro_list = skb;
4128         ret = GRO_HELD;
4129
4130 pull:
4131         grow = skb_gro_offset(skb) - skb_headlen(skb);
4132         if (grow > 0)
4133                 gro_pull_from_frag0(skb, grow);
4134 ok:
4135         return ret;
4136
4137 normal:
4138         ret = GRO_NORMAL;
4139         goto pull;
4140 }
4141
4142 struct packet_offload *gro_find_receive_by_type(__be16 type)
4143 {
4144         struct list_head *offload_head = &offload_base;
4145         struct packet_offload *ptype;
4146
4147         list_for_each_entry_rcu(ptype, offload_head, list) {
4148                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4149                         continue;
4150                 return ptype;
4151         }
4152         return NULL;
4153 }
4154 EXPORT_SYMBOL(gro_find_receive_by_type);
4155
4156 struct packet_offload *gro_find_complete_by_type(__be16 type)
4157 {
4158         struct list_head *offload_head = &offload_base;
4159         struct packet_offload *ptype;
4160
4161         list_for_each_entry_rcu(ptype, offload_head, list) {
4162                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4163                         continue;
4164                 return ptype;
4165         }
4166         return NULL;
4167 }
4168 EXPORT_SYMBOL(gro_find_complete_by_type);
4169
4170 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4171 {
4172         switch (ret) {
4173         case GRO_NORMAL:
4174                 if (netif_receive_skb_internal(skb))
4175                         ret = GRO_DROP;
4176                 break;
4177
4178         case GRO_DROP:
4179                 kfree_skb(skb);
4180                 break;
4181
4182         case GRO_MERGED_FREE:
4183                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4184                         kmem_cache_free(skbuff_head_cache, skb);
4185                 else
4186                         __kfree_skb(skb);
4187                 break;
4188
4189         case GRO_HELD:
4190         case GRO_MERGED:
4191                 break;
4192         }
4193
4194         return ret;
4195 }
4196
4197 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4198 {
4199         trace_napi_gro_receive_entry(skb);
4200
4201         skb_gro_reset_offset(skb);
4202
4203         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4204 }
4205 EXPORT_SYMBOL(napi_gro_receive);
4206
4207 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4208 {
4209         if (unlikely(skb->pfmemalloc)) {
4210                 consume_skb(skb);
4211                 return;
4212         }
4213         __skb_pull(skb, skb_headlen(skb));
4214         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4215         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4216         skb->vlan_tci = 0;
4217         skb->dev = napi->dev;
4218         skb->skb_iif = 0;
4219         skb->encapsulation = 0;
4220         skb_shinfo(skb)->gso_type = 0;
4221         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4222
4223         napi->skb = skb;
4224 }
4225
4226 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4227 {
4228         struct sk_buff *skb = napi->skb;
4229
4230         if (!skb) {
4231                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4232                 napi->skb = skb;
4233         }
4234         return skb;
4235 }
4236 EXPORT_SYMBOL(napi_get_frags);
4237
4238 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4239                                       struct sk_buff *skb,
4240                                       gro_result_t ret)
4241 {
4242         switch (ret) {
4243         case GRO_NORMAL:
4244         case GRO_HELD:
4245                 __skb_push(skb, ETH_HLEN);
4246                 skb->protocol = eth_type_trans(skb, skb->dev);
4247                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4248                         ret = GRO_DROP;
4249                 break;
4250
4251         case GRO_DROP:
4252         case GRO_MERGED_FREE:
4253                 napi_reuse_skb(napi, skb);
4254                 break;
4255
4256         case GRO_MERGED:
4257                 break;
4258         }
4259
4260         return ret;
4261 }
4262
4263 /* Upper GRO stack assumes network header starts at gro_offset=0
4264  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4265  * We copy ethernet header into skb->data to have a common layout.
4266  */
4267 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4268 {
4269         struct sk_buff *skb = napi->skb;
4270         const struct ethhdr *eth;
4271         unsigned int hlen = sizeof(*eth);
4272
4273         napi->skb = NULL;
4274
4275         skb_reset_mac_header(skb);
4276         skb_gro_reset_offset(skb);
4277
4278         eth = skb_gro_header_fast(skb, 0);
4279         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4280                 eth = skb_gro_header_slow(skb, hlen, 0);
4281                 if (unlikely(!eth)) {
4282                         napi_reuse_skb(napi, skb);
4283                         return NULL;
4284                 }
4285         } else {
4286                 gro_pull_from_frag0(skb, hlen);
4287                 NAPI_GRO_CB(skb)->frag0 += hlen;
4288                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4289         }
4290         __skb_pull(skb, hlen);
4291
4292         /*
4293          * This works because the only protocols we care about don't require
4294          * special handling.
4295          * We'll fix it up properly in napi_frags_finish()
4296          */
4297         skb->protocol = eth->h_proto;
4298
4299         return skb;
4300 }
4301
4302 gro_result_t napi_gro_frags(struct napi_struct *napi)
4303 {
4304         struct sk_buff *skb = napi_frags_skb(napi);
4305
4306         if (!skb)
4307                 return GRO_DROP;
4308
4309         trace_napi_gro_frags_entry(skb);
4310
4311         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4312 }
4313 EXPORT_SYMBOL(napi_gro_frags);
4314
4315 /* Compute the checksum from gro_offset and return the folded value
4316  * after adding in any pseudo checksum.
4317  */
4318 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4319 {
4320         __wsum wsum;
4321         __sum16 sum;
4322
4323         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4324
4325         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4326         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4327         if (likely(!sum)) {
4328                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4329                     !skb->csum_complete_sw)
4330                         netdev_rx_csum_fault(skb->dev);
4331         }
4332
4333         NAPI_GRO_CB(skb)->csum = wsum;
4334         NAPI_GRO_CB(skb)->csum_valid = 1;
4335
4336         return sum;
4337 }
4338 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4339
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * Without CONFIG_RPS the function only re-enables local irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (!pending) {
		local_irq_enable();
		return;
	}

	/* Detach the list while irqs are still off, then walk it. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	}
#else
	local_irq_enable();
#endif
}
4367
4368 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4369 {
4370 #ifdef CONFIG_RPS
4371         return sd->rps_ipi_list != NULL;
4372 #else
4373         return false;
4374 #endif
4375 }
4376
4377 static int process_backlog(struct napi_struct *napi, int quota)
4378 {
4379         int work = 0;
4380         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4381
4382         /* Check if we have pending ipi, its better to send them now,
4383          * not waiting net_rx_action() end.
4384          */
4385         if (sd_has_rps_ipi_waiting(sd)) {
4386                 local_irq_disable();
4387                 net_rps_action_and_irq_enable(sd);
4388         }
4389
4390         napi->weight = weight_p;
4391         local_irq_disable();
4392         while (1) {
4393                 struct sk_buff *skb;
4394
4395                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4396                         rcu_read_lock();
4397                         local_irq_enable();
4398                         __netif_receive_skb(skb);
4399                         rcu_read_unlock();
4400                         local_irq_disable();
4401                         input_queue_head_incr(sd);
4402                         if (++work >= quota) {
4403                                 local_irq_enable();
4404                                 return work;
4405                         }
4406                 }
4407
4408                 rps_lock(sd);
4409                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4410                         /*
4411                          * Inline a custom version of __napi_complete().
4412                          * only current cpu owns and manipulates this napi,
4413                          * and NAPI_STATE_SCHED is the only possible flag set
4414                          * on backlog.
4415                          * We can use a plain write instead of clear_bit(),
4416                          * and we dont need an smp_mb() memory barrier.
4417                          */
4418                         napi->state = 0;
4419                         rps_unlock(sd);
4420
4421                         break;
4422                 }
4423
4424                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4425                                            &sd->process_queue);
4426                 rps_unlock(sd);
4427         }
4428         local_irq_enable();
4429
4430         return work;
4431 }
4432
4433 /**
4434  * __napi_schedule - schedule for receive
4435  * @n: entry to schedule
4436  *
4437  * The entry's receive function will be scheduled to run.
4438  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4439  */
4440 void __napi_schedule(struct napi_struct *n)
4441 {
4442         unsigned long flags;
4443
4444         local_irq_save(flags);
4445         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4446         local_irq_restore(flags);
4447 }
4448 EXPORT_SYMBOL(__napi_schedule);
4449
4450 /**
4451  * __napi_schedule_irqoff - schedule for receive
4452  * @n: entry to schedule
4453  *
4454  * Variant of __napi_schedule() assuming hard irqs are masked
4455  */
4456 void __napi_schedule_irqoff(struct napi_struct *n)
4457 {
4458         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4459 }
4460 EXPORT_SYMBOL(__napi_schedule_irqoff);
4461
4462 void __napi_complete(struct napi_struct *n)
4463 {
4464         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4465
4466         list_del_init(&n->poll_list);
4467         smp_mb__before_atomic();
4468         clear_bit(NAPI_STATE_SCHED, &n->state);
4469 }
4470 EXPORT_SYMBOL(__napi_complete);
4471
4472 void napi_complete_done(struct napi_struct *n, int work_done)
4473 {
4474         unsigned long flags;
4475
4476         /*
4477          * don't let napi dequeue from the cpu poll list
4478          * just in case its running on a different cpu
4479          */
4480         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4481                 return;
4482
4483         if (n->gro_list) {
4484                 unsigned long timeout = 0;
4485
4486                 if (work_done)
4487                         timeout = n->dev->gro_flush_timeout;
4488
4489                 if (timeout)
4490                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4491                                       HRTIMER_MODE_REL_PINNED);
4492                 else
4493                         napi_gro_flush(n, false);
4494         }
4495         if (likely(list_empty(&n->poll_list))) {
4496                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4497         } else {
4498                 /* If n->poll_list is not empty, we need to mask irqs */
4499                 local_irq_save(flags);
4500                 __napi_complete(n);
4501                 local_irq_restore(flags);
4502         }
4503 }
4504 EXPORT_SYMBOL(napi_complete_done);
4505
4506 /* must be called under rcu_read_lock(), as we dont take a reference */
4507 struct napi_struct *napi_by_id(unsigned int napi_id)
4508 {
4509         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4510         struct napi_struct *napi;
4511
4512         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4513                 if (napi->napi_id == napi_id)
4514                         return napi;
4515
4516         return NULL;
4517 }
4518 EXPORT_SYMBOL_GPL(napi_by_id);
4519
4520 void napi_hash_add(struct napi_struct *napi)
4521 {
4522         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4523
4524                 spin_lock(&napi_hash_lock);
4525
4526                 /* 0 is not a valid id, we also skip an id that is taken
4527                  * we expect both events to be extremely rare
4528                  */
4529                 napi->napi_id = 0;
4530                 while (!napi->napi_id) {
4531                         napi->napi_id = ++napi_gen_id;
4532                         if (napi_by_id(napi->napi_id))
4533                                 napi->napi_id = 0;
4534                 }
4535
4536                 hlist_add_head_rcu(&napi->napi_hash_node,
4537                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4538
4539                 spin_unlock(&napi_hash_lock);
4540         }
4541 }
4542 EXPORT_SYMBOL_GPL(napi_hash_add);
4543
4544 /* Warning : caller is responsible to make sure rcu grace period
4545  * is respected before freeing memory containing @napi
4546  */
4547 void napi_hash_del(struct napi_struct *napi)
4548 {
4549         spin_lock(&napi_hash_lock);
4550
4551         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4552                 hlist_del_rcu(&napi->napi_hash_node);
4553
4554         spin_unlock(&napi_hash_lock);
4555 }
4556 EXPORT_SYMBOL_GPL(napi_hash_del);
4557
4558 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4559 {
4560         struct napi_struct *napi;
4561
4562         napi = container_of(timer, struct napi_struct, timer);
4563         if (napi->gro_list)
4564                 napi_schedule(napi);
4565
4566         return HRTIMER_NORESTART;
4567 }
4568
4569 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4570                     int (*poll)(struct napi_struct *, int), int weight)
4571 {
4572         INIT_LIST_HEAD(&napi->poll_list);
4573         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4574         napi->timer.function = napi_watchdog;
4575         napi->gro_count = 0;
4576         napi->gro_list = NULL;
4577         napi->skb = NULL;
4578         napi->poll = poll;
4579         if (weight > NAPI_POLL_WEIGHT)
4580                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4581                             weight, dev->name);
4582         napi->weight = weight;
4583         list_add(&napi->dev_list, &dev->napi_list);
4584         napi->dev = dev;
4585 #ifdef CONFIG_NETPOLL
4586         spin_lock_init(&napi->poll_lock);
4587         napi->poll_owner = -1;
4588 #endif
4589         set_bit(NAPI_STATE_SCHED, &napi->state);
4590 }
4591 EXPORT_SYMBOL(netif_napi_add);
4592
4593 void napi_disable(struct napi_struct *n)
4594 {
4595         might_sleep();
4596         set_bit(NAPI_STATE_DISABLE, &n->state);
4597
4598         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4599                 msleep(1);
4600
4601         hrtimer_cancel(&n->timer);
4602
4603         clear_bit(NAPI_STATE_DISABLE, &n->state);
4604 }
4605 EXPORT_SYMBOL(napi_disable);
4606
4607 void netif_napi_del(struct napi_struct *napi)
4608 {
4609         list_del_init(&napi->dev_list);
4610         napi_free_frags(napi);
4611
4612         kfree_skb_list(napi->gro_list);
4613         napi->gro_list = NULL;
4614         napi->gro_count = 0;
4615 }
4616 EXPORT_SYMBOL(netif_napi_del);
4617
4618 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4619 {
4620         void *have;
4621         int work, weight;
4622
4623         list_del_init(&n->poll_list);
4624
4625         have = netpoll_poll_lock(n);
4626
4627         weight = n->weight;
4628
4629         /* This NAPI_STATE_SCHED test is for avoiding a race
4630          * with netpoll's poll_napi().  Only the entity which
4631          * obtains the lock and sees NAPI_STATE_SCHED set will
4632          * actually make the ->poll() call.  Therefore we avoid
4633          * accidentally calling ->poll() when NAPI is not scheduled.
4634          */
4635         work = 0;
4636         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4637                 work = n->poll(n, weight);
4638                 trace_napi_poll(n);
4639         }
4640
4641         WARN_ON_ONCE(work > weight);
4642
4643         if (likely(work < weight))
4644                 goto out_unlock;
4645
4646         /* Drivers must not modify the NAPI state if they
4647          * consume the entire weight.  In such cases this code
4648          * still "owns" the NAPI instance and therefore can
4649          * move the instance around on the list at-will.
4650          */
4651         if (unlikely(napi_disable_pending(n))) {
4652                 napi_complete(n);
4653                 goto out_unlock;
4654         }
4655
4656         if (n->gro_list) {
4657                 /* flush too old packets
4658                  * If HZ < 1000, flush all packets.
4659                  */
4660                 napi_gro_flush(n, HZ >= 1000);
4661         }
4662
4663         /* Some drivers may have called napi_schedule
4664          * prior to exhausting their budget.
4665          */
4666         if (unlikely(!list_empty(&n->poll_list))) {
4667                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4668                              n->dev ? n->dev->name : "backlog");
4669                 goto out_unlock;
4670         }
4671
4672         list_add_tail(&n->poll_list, repoll);
4673
4674 out_unlock:
4675         netpoll_poll_unlock(have);
4676
4677         return work;
4678 }
4679
4680 static void net_rx_action(struct softirq_action *h)
4681 {
4682         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4683         unsigned long time_limit = jiffies + 2;
4684         int budget = netdev_budget;
4685         LIST_HEAD(list);
4686         LIST_HEAD(repoll);
4687
4688         local_irq_disable();
4689         list_splice_init(&sd->poll_list, &list);
4690         local_irq_enable();
4691
4692         for (;;) {
4693                 struct napi_struct *n;
4694
4695                 if (list_empty(&list)) {
4696                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4697                                 return;
4698                         break;
4699                 }
4700
4701                 n = list_first_entry(&list, struct napi_struct, poll_list);
4702                 budget -= napi_poll(n, &repoll);
4703
4704                 /* If softirq window is exhausted then punt.
4705                  * Allow this to run for 2 jiffies since which will allow
4706                  * an average latency of 1.5/HZ.
4707                  */
4708                 if (unlikely(budget <= 0 ||
4709                              time_after_eq(jiffies, time_limit))) {
4710                         sd->time_squeeze++;
4711                         break;
4712                 }
4713         }
4714
4715         local_irq_disable();
4716
4717         list_splice_tail_init(&sd->poll_list, &list);
4718         list_splice_tail(&repoll, &list);
4719         list_splice(&list, &sd->poll_list);
4720         if (!list_empty(&sd->poll_list))
4721                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4722
4723         net_rps_action_and_irq_enable(sd);
4724 }
4725
4726 struct netdev_adjacent {
4727         struct net_device *dev;
4728
4729         /* upper master flag, there can only be one master device per list */
4730         bool master;
4731
4732         /* counter for the number of times this device was added to us */
4733         u16 ref_nr;
4734
4735         /* private field for the users */
4736         void *private;
4737
4738         struct list_head list;
4739         struct rcu_head rcu;
4740 };
4741
4742 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4743                                                  struct net_device *adj_dev,
4744                                                  struct list_head *adj_list)
4745 {
4746         struct netdev_adjacent *adj;
4747
4748         list_for_each_entry(adj, adj_list, list) {
4749                 if (adj->dev == adj_dev)
4750                         return adj;
4751         }
4752         return NULL;
4753 }
4754
4755 /**
4756  * netdev_has_upper_dev - Check if device is linked to an upper device
4757  * @dev: device
4758  * @upper_dev: upper device to check
4759  *
4760  * Find out if a device is linked to specified upper device and return true
4761  * in case it is. Note that this checks only immediate upper device,
4762  * not through a complete stack of devices. The caller must hold the RTNL lock.
4763  */
4764 bool netdev_has_upper_dev(struct net_device *dev,
4765                           struct net_device *upper_dev)
4766 {
4767         ASSERT_RTNL();
4768
4769         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4770 }
4771 EXPORT_SYMBOL(netdev_has_upper_dev);
4772
4773 /**
4774  * netdev_has_any_upper_dev - Check if device is linked to some device
4775  * @dev: device
4776  *
4777  * Find out if a device is linked to an upper device and return true in case
4778  * it is. The caller must hold the RTNL lock.
4779  */
4780 static bool netdev_has_any_upper_dev(struct net_device *dev)
4781 {
4782         ASSERT_RTNL();
4783
4784         return !list_empty(&dev->all_adj_list.upper);
4785 }
4786
4787 /**
4788  * netdev_master_upper_dev_get - Get master upper device
4789  * @dev: device
4790  *
4791  * Find a master upper device and return pointer to it or NULL in case
4792  * it's not there. The caller must hold the RTNL lock.
4793  */
4794 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4795 {
4796         struct netdev_adjacent *upper;
4797
4798         ASSERT_RTNL();
4799
4800         if (list_empty(&dev->adj_list.upper))
4801                 return NULL;
4802
4803         upper = list_first_entry(&dev->adj_list.upper,
4804                                  struct netdev_adjacent, list);
4805         if (likely(upper->master))
4806                 return upper->dev;
4807         return NULL;
4808 }
4809 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4810
4811 void *netdev_adjacent_get_private(struct list_head *adj_list)
4812 {
4813         struct netdev_adjacent *adj;
4814
4815         adj = list_entry(adj_list, struct netdev_adjacent, list);
4816
4817         return adj->private;
4818 }
4819 EXPORT_SYMBOL(netdev_adjacent_get_private);
4820
4821 /**
4822  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4823  * @dev: device
4824  * @iter: list_head ** of the current position
4825  *
4826  * Gets the next device from the dev's upper list, starting from iter
4827  * position. The caller must hold RCU read lock.
4828  */
4829 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4830                                                  struct list_head **iter)
4831 {
4832         struct netdev_adjacent *upper;
4833
4834         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4835
4836         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4837
4838         if (&upper->list == &dev->adj_list.upper)
4839                 return NULL;
4840
4841         *iter = &upper->list;
4842
4843         return upper->dev;
4844 }
4845 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4846
4847 /**
4848  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4849  * @dev: device
4850  * @iter: list_head ** of the current position
4851  *
4852  * Gets the next device from the dev's upper list, starting from iter
4853  * position. The caller must hold RCU read lock.
4854  */
4855 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4856                                                      struct list_head **iter)
4857 {
4858         struct netdev_adjacent *upper;
4859
4860         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4861
4862         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4863
4864         if (&upper->list == &dev->all_adj_list.upper)
4865                 return NULL;
4866
4867         *iter = &upper->list;
4868
4869         return upper->dev;
4870 }
4871 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4872
4873 /**
4874  * netdev_lower_get_next_private - Get the next ->private from the
4875  *                                 lower neighbour list
4876  * @dev: device
4877  * @iter: list_head ** of the current position
4878  *
4879  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4880  * list, starting from iter position. The caller must hold either hold the
4881  * RTNL lock or its own locking that guarantees that the neighbour lower
4882  * list will remain unchainged.
4883  */
4884 void *netdev_lower_get_next_private(struct net_device *dev,
4885                                     struct list_head **iter)
4886 {
4887         struct netdev_adjacent *lower;
4888
4889         lower = list_entry(*iter, struct netdev_adjacent, list);
4890
4891         if (&lower->list == &dev->adj_list.lower)
4892                 return NULL;
4893
4894         *iter = lower->list.next;
4895
4896         return lower->private;
4897 }
4898 EXPORT_SYMBOL(netdev_lower_get_next_private);
4899
4900 /**
4901  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4902  *                                     lower neighbour list, RCU
4903  *                                     variant
4904  * @dev: device
4905  * @iter: list_head ** of the current position
4906  *
4907  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4908  * list, starting from iter position. The caller must hold RCU read lock.
4909  */
4910 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4911                                         struct list_head **iter)
4912 {
4913         struct netdev_adjacent *lower;
4914
4915         WARN_ON_ONCE(!rcu_read_lock_held());
4916
4917         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4918
4919         if (&lower->list == &dev->adj_list.lower)
4920                 return NULL;
4921
4922         *iter = &lower->list;
4923
4924         return lower->private;
4925 }
4926 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4927
4928 /**
4929  * netdev_lower_get_next - Get the next device from the lower neighbour
4930  *                         list
4931  * @dev: device
4932  * @iter: list_head ** of the current position
4933  *
4934  * Gets the next netdev_adjacent from the dev's lower neighbour
4935  * list, starting from iter position. The caller must hold RTNL lock or
4936  * its own locking that guarantees that the neighbour lower
4937  * list will remain unchainged.
4938  */
4939 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4940 {
4941         struct netdev_adjacent *lower;
4942
4943         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4944
4945         if (&lower->list == &dev->adj_list.lower)
4946                 return NULL;
4947
4948         *iter = &lower->list;
4949
4950         return lower->dev;
4951 }
4952 EXPORT_SYMBOL(netdev_lower_get_next);
4953
4954 /**
4955  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4956  *                                     lower neighbour list, RCU
4957  *                                     variant
4958  * @dev: device
4959  *
4960  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4961  * list. The caller must hold RCU read lock.
4962  */
4963 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4964 {
4965         struct netdev_adjacent *lower;
4966
4967         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4968                         struct netdev_adjacent, list);
4969         if (lower)
4970                 return lower->private;
4971         return NULL;
4972 }
4973 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4974
4975 /**
4976  * netdev_master_upper_dev_get_rcu - Get master upper device
4977  * @dev: device
4978  *
4979  * Find a master upper device and return pointer to it or NULL in case
4980  * it's not there. The caller must hold the RCU read lock.
4981  */
4982 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4983 {
4984         struct netdev_adjacent *upper;
4985
4986         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4987                                        struct netdev_adjacent, list);
4988         if (upper && likely(upper->master))
4989                 return upper->dev;
4990         return NULL;
4991 }
4992 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4993
4994 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4995                               struct net_device *adj_dev,
4996                               struct list_head *dev_list)
4997 {
4998         char linkname[IFNAMSIZ+7];
4999         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5000                 "upper_%s" : "lower_%s", adj_dev->name);
5001         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5002                                  linkname);
5003 }
5004 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5005                                char *name,
5006                                struct list_head *dev_list)
5007 {
5008         char linkname[IFNAMSIZ+7];
5009         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5010                 "upper_%s" : "lower_%s", name);
5011         sysfs_remove_link(&(dev->dev.kobj), linkname);
5012 }
5013
5014 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5015                                                  struct net_device *adj_dev,
5016                                                  struct list_head *dev_list)
5017 {
5018         return (dev_list == &dev->adj_list.upper ||
5019                 dev_list == &dev->adj_list.lower) &&
5020                 net_eq(dev_net(dev), dev_net(adj_dev));
5021 }
5022
5023 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5024                                         struct net_device *adj_dev,
5025                                         struct list_head *dev_list,
5026                                         void *private, bool master)
5027 {
5028         struct netdev_adjacent *adj;
5029         int ret;
5030
5031         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5032
5033         if (adj) {
5034                 adj->ref_nr++;
5035                 return 0;
5036         }
5037
5038         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5039         if (!adj)
5040                 return -ENOMEM;
5041
5042         adj->dev = adj_dev;
5043         adj->master = master;
5044         adj->ref_nr = 1;
5045         adj->private = private;
5046         dev_hold(adj_dev);
5047
5048         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5049                  adj_dev->name, dev->name, adj_dev->name);
5050
5051         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5052                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5053                 if (ret)
5054                         goto free_adj;
5055         }
5056
5057         /* Ensure that master link is always the first item in list. */
5058         if (master) {
5059                 ret = sysfs_create_link(&(dev->dev.kobj),
5060                                         &(adj_dev->dev.kobj), "master");
5061                 if (ret)
5062                         goto remove_symlinks;
5063
5064                 list_add_rcu(&adj->list, dev_list);
5065         } else {
5066                 list_add_tail_rcu(&adj->list, dev_list);
5067         }
5068
5069         return 0;
5070
5071 remove_symlinks:
5072         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5073                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5074 free_adj:
5075         kfree(adj);
5076         dev_put(adj_dev);
5077
5078         return ret;
5079 }
5080
5081 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5082                                          struct net_device *adj_dev,
5083                                          struct list_head *dev_list)
5084 {
5085         struct netdev_adjacent *adj;
5086
5087         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5088
5089         if (!adj) {
5090                 pr_err("tried to remove device %s from %s\n",
5091                        dev->name, adj_dev->name);
5092                 BUG();
5093         }
5094
5095         if (adj->ref_nr > 1) {
5096                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5097                          adj->ref_nr-1);
5098                 adj->ref_nr--;
5099                 return;
5100         }
5101
5102         if (adj->master)
5103                 sysfs_remove_link(&(dev->dev.kobj), "master");
5104
5105         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5106                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5107
5108         list_del_rcu(&adj->list);
5109         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5110                  adj_dev->name, dev->name, adj_dev->name);
5111         dev_put(adj_dev);
5112         kfree_rcu(adj, rcu);
5113 }
5114
5115 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5116                                             struct net_device *upper_dev,
5117                                             struct list_head *up_list,
5118                                             struct list_head *down_list,
5119                                             void *private, bool master)
5120 {
5121         int ret;
5122
5123         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5124                                            master);
5125         if (ret)
5126                 return ret;
5127
5128         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5129                                            false);
5130         if (ret) {
5131                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5132                 return ret;
5133         }
5134
5135         return 0;
5136 }
5137
5138 static int __netdev_adjacent_dev_link(struct net_device *dev,
5139                                       struct net_device *upper_dev)
5140 {
5141         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5142                                                 &dev->all_adj_list.upper,
5143                                                 &upper_dev->all_adj_list.lower,
5144                                                 NULL, false);
5145 }
5146
/* Undo __netdev_adjacent_dev_link_lists(): drop @upper_dev from @up_list
 * and then @dev from @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	/* upper direction first, then the matching lower direction */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5155
5156 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5157                                          struct net_device *upper_dev)
5158 {
5159         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5160                                            &dev->all_adj_list.upper,
5161                                            &upper_dev->all_adj_list.lower);
5162 }
5163
5164 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5165                                                 struct net_device *upper_dev,
5166                                                 void *private, bool master)
5167 {
5168         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5169
5170         if (ret)
5171                 return ret;
5172
5173         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5174                                                &dev->adj_list.upper,
5175                                                &upper_dev->adj_list.lower,
5176                                                private, master);
5177         if (ret) {
5178                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5179                 return ret;
5180         }
5181
5182         return 0;
5183 }
5184
5185 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5186                                                    struct net_device *upper_dev)
5187 {
5188         __netdev_adjacent_dev_unlink(dev, upper_dev);
5189         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5190                                            &dev->adj_list.upper,
5191                                            &upper_dev->adj_list.lower);
5192 }
5193
5194 static int __netdev_upper_dev_link(struct net_device *dev,
5195                                    struct net_device *upper_dev, bool master,
5196                                    void *private)
5197 {
5198         struct netdev_adjacent *i, *j, *to_i, *to_j;
5199         int ret = 0;
5200
5201         ASSERT_RTNL();
5202
5203         if (dev == upper_dev)
5204                 return -EBUSY;
5205
5206         /* To prevent loops, check if dev is not upper device to upper_dev. */
5207         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5208                 return -EBUSY;
5209
5210         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5211                 return -EEXIST;
5212
5213         if (master && netdev_master_upper_dev_get(dev))
5214                 return -EBUSY;
5215
5216         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5217                                                    master);
5218         if (ret)
5219                 return ret;
5220
5221         /* Now that we linked these devs, make all the upper_dev's
5222          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5223          * versa, and don't forget the devices itself. All of these
5224          * links are non-neighbours.
5225          */
5226         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5227                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5228                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5229                                  i->dev->name, j->dev->name);
5230                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5231                         if (ret)
5232                                 goto rollback_mesh;
5233                 }
5234         }
5235
5236         /* add dev to every upper_dev's upper device */
5237         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5238                 pr_debug("linking %s's upper device %s with %s\n",
5239                          upper_dev->name, i->dev->name, dev->name);
5240                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5241                 if (ret)
5242                         goto rollback_upper_mesh;
5243         }
5244
5245         /* add upper_dev to every dev's lower device */
5246         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5247                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5248                          i->dev->name, upper_dev->name);
5249                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5250                 if (ret)
5251                         goto rollback_lower_mesh;
5252         }
5253
5254         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5255         return 0;
5256
5257 rollback_lower_mesh:
5258         to_i = i;
5259         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5260                 if (i == to_i)
5261                         break;
5262                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5263         }
5264
5265         i = NULL;
5266
5267 rollback_upper_mesh:
5268         to_i = i;
5269         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5270                 if (i == to_i)
5271                         break;
5272                 __netdev_adjacent_dev_unlink(dev, i->dev);
5273         }
5274
5275         i = j = NULL;
5276
5277 rollback_mesh:
5278         to_i = i;
5279         to_j = j;
5280         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5281                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5282                         if (i == to_i && j == to_j)
5283                                 break;
5284                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5285                 }
5286                 if (i == to_i)
5287                         break;
5288         }
5289
5290         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5291
5292         return ret;
5293 }
5294
5295 /**
5296  * netdev_upper_dev_link - Add a link to the upper device
5297  * @dev: device
5298  * @upper_dev: new upper device
5299  *
5300  * Adds a link to device which is upper to this one. The caller must hold
5301  * the RTNL lock. On a failure a negative errno code is returned.
5302  * On success the reference counts are adjusted and the function
5303  * returns zero.
5304  */
5305 int netdev_upper_dev_link(struct net_device *dev,
5306                           struct net_device *upper_dev)
5307 {
5308         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5309 }
5310 EXPORT_SYMBOL(netdev_upper_dev_link);
5311
5312 /**
5313  * netdev_master_upper_dev_link - Add a master link to the upper device
5314  * @dev: device
5315  * @upper_dev: new upper device
5316  *
5317  * Adds a link to device which is upper to this one. In this case, only
5318  * one master upper device can be linked, although other non-master devices
5319  * might be linked as well. The caller must hold the RTNL lock.
5320  * On a failure a negative errno code is returned. On success the reference
5321  * counts are adjusted and the function returns zero.
5322  */
5323 int netdev_master_upper_dev_link(struct net_device *dev,
5324                                  struct net_device *upper_dev)
5325 {
5326         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5327 }
5328 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5329
5330 int netdev_master_upper_dev_link_private(struct net_device *dev,
5331                                          struct net_device *upper_dev,
5332                                          void *private)
5333 {
5334         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5335 }
5336 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5337
5338 /**
5339  * netdev_upper_dev_unlink - Removes a link to upper device
5340  * @dev: device
5341  * @upper_dev: new upper device
5342  *
5343  * Removes a link to device which is upper to this one. The caller must hold
5344  * the RTNL lock.
5345  */
5346 void netdev_upper_dev_unlink(struct net_device *dev,
5347                              struct net_device *upper_dev)
5348 {
5349         struct netdev_adjacent *i, *j;
5350         ASSERT_RTNL();
5351
5352         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5353
5354         /* Here is the tricky part. We must remove all dev's lower
5355          * devices from all upper_dev's upper devices and vice
5356          * versa, to maintain the graph relationship.
5357          */
5358         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5359                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5360                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5361
5362         /* remove also the devices itself from lower/upper device
5363          * list
5364          */
5365         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5366                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5367
5368         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5369                 __netdev_adjacent_dev_unlink(dev, i->dev);
5370
5371         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5372 }
5373 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5374
5375 /**
5376  * netdev_bonding_info_change - Dispatch event about slave change
5377  * @dev: device
5378  * @bonding_info: info to dispatch
5379  *
5380  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5381  * The caller must hold the RTNL lock.
5382  */
5383 void netdev_bonding_info_change(struct net_device *dev,
5384                                 struct netdev_bonding_info *bonding_info)
5385 {
5386         struct netdev_notifier_bonding_info     info;
5387
5388         memcpy(&info.bonding_info, bonding_info,
5389                sizeof(struct netdev_bonding_info));
5390         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5391                                       &info.info);
5392 }
5393 EXPORT_SYMBOL(netdev_bonding_info_change);
5394
5395 static void netdev_adjacent_add_links(struct net_device *dev)
5396 {
5397         struct netdev_adjacent *iter;
5398
5399         struct net *net = dev_net(dev);
5400
5401         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5402                 if (!net_eq(net,dev_net(iter->dev)))
5403                         continue;
5404                 netdev_adjacent_sysfs_add(iter->dev, dev,
5405                                           &iter->dev->adj_list.lower);
5406                 netdev_adjacent_sysfs_add(dev, iter->dev,
5407                                           &dev->adj_list.upper);
5408         }
5409
5410         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5411                 if (!net_eq(net,dev_net(iter->dev)))
5412                         continue;
5413                 netdev_adjacent_sysfs_add(iter->dev, dev,
5414                                           &iter->dev->adj_list.upper);
5415                 netdev_adjacent_sysfs_add(dev, iter->dev,
5416                                           &dev->adj_list.lower);
5417         }
5418 }
5419
5420 static void netdev_adjacent_del_links(struct net_device *dev)
5421 {
5422         struct netdev_adjacent *iter;
5423
5424         struct net *net = dev_net(dev);
5425
5426         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5427                 if (!net_eq(net,dev_net(iter->dev)))
5428                         continue;
5429                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5430                                           &iter->dev->adj_list.lower);
5431                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5432                                           &dev->adj_list.upper);
5433         }
5434
5435         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5436                 if (!net_eq(net,dev_net(iter->dev)))
5437                         continue;
5438                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5439                                           &iter->dev->adj_list.upper);
5440                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5441                                           &dev->adj_list.lower);
5442         }
5443 }
5444
5445 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5446 {
5447         struct netdev_adjacent *iter;
5448
5449         struct net *net = dev_net(dev);
5450
5451         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5452                 if (!net_eq(net,dev_net(iter->dev)))
5453                         continue;
5454                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5455                                           &iter->dev->adj_list.lower);
5456                 netdev_adjacent_sysfs_add(iter->dev, dev,
5457                                           &iter->dev->adj_list.lower);
5458         }
5459
5460         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5461                 if (!net_eq(net,dev_net(iter->dev)))
5462                         continue;
5463                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5464                                           &iter->dev->adj_list.upper);
5465                 netdev_adjacent_sysfs_add(iter->dev, dev,
5466                                           &iter->dev->adj_list.upper);
5467         }
5468 }
5469
5470 void *netdev_lower_dev_get_private(struct net_device *dev,
5471                                    struct net_device *lower_dev)
5472 {
5473         struct netdev_adjacent *lower;
5474
5475         if (!lower_dev)
5476                 return NULL;
5477         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5478         if (!lower)
5479                 return NULL;
5480
5481         return lower->private;
5482 }
5483 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5484
5485
5486 int dev_get_nest_level(struct net_device *dev,
5487                        bool (*type_check)(struct net_device *dev))
5488 {
5489         struct net_device *lower = NULL;
5490         struct list_head *iter;
5491         int max_nest = -1;
5492         int nest;
5493
5494         ASSERT_RTNL();
5495
5496         netdev_for_each_lower_dev(dev, lower, iter) {
5497                 nest = dev_get_nest_level(lower, type_check);
5498                 if (max_nest < nest)
5499                         max_nest = nest;
5500         }
5501
5502         if (type_check(dev))
5503                 max_nest++;
5504
5505         return max_nest;
5506 }
5507 EXPORT_SYMBOL(dev_get_nest_level);
5508
5509 static void dev_change_rx_flags(struct net_device *dev, int flags)
5510 {
5511         const struct net_device_ops *ops = dev->netdev_ops;
5512
5513         if (ops->ndo_change_rx_flags)
5514                 ops->ndo_change_rx_flags(dev, flags);
5515 }
5516
5517 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5518 {
5519         unsigned int old_flags = dev->flags;
5520         kuid_t uid;
5521         kgid_t gid;
5522
5523         ASSERT_RTNL();
5524
5525         dev->flags |= IFF_PROMISC;
5526         dev->promiscuity += inc;
5527         if (dev->promiscuity == 0) {
5528                 /*
5529                  * Avoid overflow.
5530                  * If inc causes overflow, untouch promisc and return error.
5531                  */
5532                 if (inc < 0)
5533                         dev->flags &= ~IFF_PROMISC;
5534                 else {
5535                         dev->promiscuity -= inc;
5536                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5537                                 dev->name);
5538                         return -EOVERFLOW;
5539                 }
5540         }
5541         if (dev->flags != old_flags) {
5542                 pr_info("device %s %s promiscuous mode\n",
5543                         dev->name,
5544                         dev->flags & IFF_PROMISC ? "entered" : "left");
5545                 if (audit_enabled) {
5546                         current_uid_gid(&uid, &gid);
5547                         audit_log(current->audit_context, GFP_ATOMIC,
5548                                 AUDIT_ANOM_PROMISCUOUS,
5549                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5550                                 dev->name, (dev->flags & IFF_PROMISC),
5551                                 (old_flags & IFF_PROMISC),
5552                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5553                                 from_kuid(&init_user_ns, uid),
5554                                 from_kgid(&init_user_ns, gid),
5555                                 audit_get_sessionid(current));
5556                 }
5557
5558                 dev_change_rx_flags(dev, IFF_PROMISC);
5559         }
5560         if (notify)
5561                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5562         return 0;
5563 }
5564
5565 /**
5566  *      dev_set_promiscuity     - update promiscuity count on a device
5567  *      @dev: device
5568  *      @inc: modifier
5569  *
5570  *      Add or remove promiscuity from a device. While the count in the device
5571  *      remains above zero the interface remains promiscuous. Once it hits zero
5572  *      the device reverts back to normal filtering operation. A negative inc
5573  *      value is used to drop promiscuity on the device.
5574  *      Return 0 if successful or a negative errno code on error.
5575  */
5576 int dev_set_promiscuity(struct net_device *dev, int inc)
5577 {
5578         unsigned int old_flags = dev->flags;
5579         int err;
5580
5581         err = __dev_set_promiscuity(dev, inc, true);
5582         if (err < 0)
5583                 return err;
5584         if (dev->flags != old_flags)
5585                 dev_set_rx_mode(dev);
5586         return err;
5587 }
5588 EXPORT_SYMBOL(dev_set_promiscuity);
5589
5590 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5591 {
5592         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5593
5594         ASSERT_RTNL();
5595
5596         dev->flags |= IFF_ALLMULTI;
5597         dev->allmulti += inc;
5598         if (dev->allmulti == 0) {
5599                 /*
5600                  * Avoid overflow.
5601                  * If inc causes overflow, untouch allmulti and return error.
5602                  */
5603                 if (inc < 0)
5604                         dev->flags &= ~IFF_ALLMULTI;
5605                 else {
5606                         dev->allmulti -= inc;
5607                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5608                                 dev->name);
5609                         return -EOVERFLOW;
5610                 }
5611         }
5612         if (dev->flags ^ old_flags) {
5613                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5614                 dev_set_rx_mode(dev);
5615                 if (notify)
5616                         __dev_notify_flags(dev, old_flags,
5617                                            dev->gflags ^ old_gflags);
5618         }
5619         return 0;
5620 }
5621
/**
 *	dev_set_allmulti	- update allmulti count on a device
 *	@dev: device
 *	@inc: modifier
 *
 *	Add or remove reception of all multicast frames on a device. The
 *	interface keeps receiving all multicast frames for as long as the
 *	count in the device is above zero and goes back to normal filtering
 *	once the count reaches zero. Pass a negative @inc to drop the
 *	counter when releasing a resource that needed all multicasts.
 *	Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
	const bool notify = true;

	return __dev_set_allmulti(dev, inc, notify);
}
EXPORT_SYMBOL(dev_set_allmulti);
5640
5641 /*
5642  *      Upload unicast and multicast address lists to device and
5643  *      configure RX filtering. When the device doesn't support unicast
5644  *      filtering it is put in promiscuous mode while unicast addresses
5645  *      are present.
5646  */
5647 void __dev_set_rx_mode(struct net_device *dev)
5648 {
5649         const struct net_device_ops *ops = dev->netdev_ops;
5650
5651         /* dev_open will call this function so the list will stay sane. */
5652         if (!(dev->flags&IFF_UP))
5653                 return;
5654
5655         if (!netif_device_present(dev))
5656                 return;
5657
5658         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5659                 /* Unicast addresses changes may only happen under the rtnl,
5660                  * therefore calling __dev_set_promiscuity here is safe.
5661                  */
5662                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5663                         __dev_set_promiscuity(dev, 1, false);
5664                         dev->uc_promisc = true;
5665                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5666                         __dev_set_promiscuity(dev, -1, false);
5667                         dev->uc_promisc = false;
5668                 }
5669         }
5670
5671         if (ops->ndo_set_rx_mode)
5672                 ops->ndo_set_rx_mode(dev);
5673 }
5674
/* Locked wrapper: run __dev_set_rx_mode() on @dev between
 * netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5681
5682 /**
5683  *      dev_get_flags - get flags reported to userspace
5684  *      @dev: device
5685  *
5686  *      Get the combination of flag bits exported through APIs to userspace.
5687  */
5688 unsigned int dev_get_flags(const struct net_device *dev)
5689 {
5690         unsigned int flags;
5691
5692         flags = (dev->flags & ~(IFF_PROMISC |
5693                                 IFF_ALLMULTI |
5694                                 IFF_RUNNING |
5695                                 IFF_LOWER_UP |
5696                                 IFF_DORMANT)) |
5697                 (dev->gflags & (IFF_PROMISC |
5698                                 IFF_ALLMULTI));
5699
5700         if (netif_running(dev)) {
5701                 if (netif_oper_up(dev))
5702                         flags |= IFF_RUNNING;
5703                 if (netif_carrier_ok(dev))
5704                         flags |= IFF_LOWER_UP;
5705                 if (netif_dormant(dev))
5706                         flags |= IFF_DORMANT;
5707         }
5708
5709         return flags;
5710 }
5711 EXPORT_SYMBOL(dev_get_flags);
5712
5713 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5714 {
5715         unsigned int old_flags = dev->flags;
5716         int ret;
5717
5718         ASSERT_RTNL();
5719
5720         /*
5721          *      Set the flags on our device.
5722          */
5723
5724         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5725                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5726                                IFF_AUTOMEDIA)) |
5727                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5728                                     IFF_ALLMULTI));
5729
5730         /*
5731          *      Load in the correct multicast list now the flags have changed.
5732          */
5733
5734         if ((old_flags ^ flags) & IFF_MULTICAST)
5735                 dev_change_rx_flags(dev, IFF_MULTICAST);
5736
5737         dev_set_rx_mode(dev);
5738
5739         /*
5740          *      Have we downed the interface. We handle IFF_UP ourselves
5741          *      according to user attempts to set it, rather than blindly
5742          *      setting it.
5743          */
5744
5745         ret = 0;
5746         if ((old_flags ^ flags) & IFF_UP)
5747                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5748
5749         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5750                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5751                 unsigned int old_flags = dev->flags;
5752
5753                 dev->gflags ^= IFF_PROMISC;
5754
5755                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5756                         if (dev->flags != old_flags)
5757                                 dev_set_rx_mode(dev);
5758         }
5759
5760         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5761            is important. Some (broken) drivers set IFF_PROMISC, when
5762            IFF_ALLMULTI is requested not asking us and not reporting.
5763          */
5764         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5765                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5766
5767                 dev->gflags ^= IFF_ALLMULTI;
5768                 __dev_set_allmulti(dev, inc, false);
5769         }
5770
5771         return ret;
5772 }
5773
5774 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5775                         unsigned int gchanges)
5776 {
5777         unsigned int changes = dev->flags ^ old_flags;
5778
5779         if (gchanges)
5780                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5781
5782         if (changes & IFF_UP) {
5783                 if (dev->flags & IFF_UP)
5784                         call_netdevice_notifiers(NETDEV_UP, dev);
5785                 else
5786                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5787         }
5788
5789         if (dev->flags & IFF_UP &&
5790             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5791                 struct netdev_notifier_change_info change_info;
5792
5793                 change_info.flags_changed = changes;
5794                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5795                                               &change_info.info);
5796         }
5797 }
5798
5799 /**
5800  *      dev_change_flags - change device settings
5801  *      @dev: device
5802  *      @flags: device state flags
5803  *
5804  *      Change settings on device based state flags. The flags are
5805  *      in the userspace exported format.
5806  */
5807 int dev_change_flags(struct net_device *dev, unsigned int flags)
5808 {
5809         int ret;
5810         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5811
5812         ret = __dev_change_flags(dev, flags);
5813         if (ret < 0)
5814                 return ret;
5815
5816         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5817         __dev_notify_flags(dev, old_flags, changes);
5818         return ret;
5819 }
5820 EXPORT_SYMBOL(dev_change_flags);
5821
5822 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5823 {
5824         const struct net_device_ops *ops = dev->netdev_ops;
5825
5826         if (ops->ndo_change_mtu)
5827                 return ops->ndo_change_mtu(dev, new_mtu);
5828
5829         dev->mtu = new_mtu;
5830         return 0;
5831 }
5832
5833 /**
5834  *      dev_set_mtu - Change maximum transfer unit
5835  *      @dev: device
5836  *      @new_mtu: new transfer unit
5837  *
5838  *      Change the maximum transfer size of the network device.
5839  */
5840 int dev_set_mtu(struct net_device *dev, int new_mtu)
5841 {
5842         int err, orig_mtu;
5843
5844         if (new_mtu == dev->mtu)
5845                 return 0;
5846
5847         /*      MTU must be positive.    */
5848         if (new_mtu < 0)
5849                 return -EINVAL;
5850
5851         if (!netif_device_present(dev))
5852                 return -ENODEV;
5853
5854         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5855         err = notifier_to_errno(err);
5856         if (err)
5857                 return err;
5858
5859         orig_mtu = dev->mtu;
5860         err = __dev_set_mtu(dev, new_mtu);
5861
5862         if (!err) {
5863                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5864                 err = notifier_to_errno(err);
5865                 if (err) {
5866                         /* setting mtu back and notifying everyone again,
5867                          * so that they have a chance to revert changes.
5868                          */
5869                         __dev_set_mtu(dev, orig_mtu);
5870                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5871                 }
5872         }
5873         return err;
5874 }
5875 EXPORT_SYMBOL(dev_set_mtu);
5876
5877 /**
5878  *      dev_set_group - Change group this device belongs to
5879  *      @dev: device
5880  *      @new_group: group this device should belong to
5881  */
5882 void dev_set_group(struct net_device *dev, int new_group)
5883 {
5884         dev->group = new_group;
5885 }
5886 EXPORT_SYMBOL(dev_set_group);
5887
5888 /**
5889  *      dev_set_mac_address - Change Media Access Control Address
5890  *      @dev: device
5891  *      @sa: new address
5892  *
5893  *      Change the hardware (MAC) address of the device
5894  */
5895 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5896 {
5897         const struct net_device_ops *ops = dev->netdev_ops;
5898         int err;
5899
5900         if (!ops->ndo_set_mac_address)
5901                 return -EOPNOTSUPP;
5902         if (sa->sa_family != dev->type)
5903                 return -EINVAL;
5904         if (!netif_device_present(dev))
5905                 return -ENODEV;
5906         err = ops->ndo_set_mac_address(dev, sa);
5907         if (err)
5908                 return err;
5909         dev->addr_assign_type = NET_ADDR_SET;
5910         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5911         add_device_randomness(dev->dev_addr, dev->addr_len);
5912         return 0;
5913 }
5914 EXPORT_SYMBOL(dev_set_mac_address);
5915
5916 /**
5917  *      dev_change_carrier - Change device carrier
5918  *      @dev: device
5919  *      @new_carrier: new value
5920  *
5921  *      Change device carrier
5922  */
5923 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5924 {
5925         const struct net_device_ops *ops = dev->netdev_ops;
5926
5927         if (!ops->ndo_change_carrier)
5928                 return -EOPNOTSUPP;
5929         if (!netif_device_present(dev))
5930                 return -ENODEV;
5931         return ops->ndo_change_carrier(dev, new_carrier);
5932 }
5933 EXPORT_SYMBOL(dev_change_carrier);
5934
5935 /**
5936  *      dev_get_phys_port_id - Get device physical port ID
5937  *      @dev: device
5938  *      @ppid: port ID
5939  *
5940  *      Get device physical port ID
5941  */
5942 int dev_get_phys_port_id(struct net_device *dev,
5943                          struct netdev_phys_item_id *ppid)
5944 {
5945         const struct net_device_ops *ops = dev->netdev_ops;
5946
5947         if (!ops->ndo_get_phys_port_id)
5948                 return -EOPNOTSUPP;
5949         return ops->ndo_get_phys_port_id(dev, ppid);
5950 }
5951 EXPORT_SYMBOL(dev_get_phys_port_id);
5952
5953 /**
5954  *      dev_get_phys_port_name - Get device physical port name
5955  *      @dev: device
5956  *      @name: port name
5957  *
5958  *      Get device physical port name
5959  */
5960 int dev_get_phys_port_name(struct net_device *dev,
5961                            char *name, size_t len)
5962 {
5963         const struct net_device_ops *ops = dev->netdev_ops;
5964
5965         if (!ops->ndo_get_phys_port_name)
5966                 return -EOPNOTSUPP;
5967         return ops->ndo_get_phys_port_name(dev, name, len);
5968 }
5969 EXPORT_SYMBOL(dev_get_phys_port_name);
5970
5971 /**
5972  *      dev_new_index   -       allocate an ifindex
5973  *      @net: the applicable net namespace
5974  *
5975  *      Returns a suitable unique value for a new device interface
5976  *      number.  The caller must hold the rtnl semaphore or the
5977  *      dev_base_lock to be sure it remains unique.
5978  */
5979 static int dev_new_index(struct net *net)
5980 {
5981         int ifindex = net->ifindex;
5982         for (;;) {
5983                 if (++ifindex <= 0)
5984                         ifindex = 1;
5985                 if (!__dev_get_by_index(net, ifindex))
5986                         return net->ifindex = ifindex;
5987         }
5988 }
5989
/* Delayed registration/unregistration: devices are queued on
 * net_todo_list by net_set_todo().
 */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5993
5994 static void net_set_todo(struct net_device *dev)
5995 {
5996         list_add_tail(&dev->todo_list, &net_todo_list);
5997         dev_net(dev)->dev_unreg_count++;
5998 }
5999
6000 static void rollback_registered_many(struct list_head *head)
6001 {
6002         struct net_device *dev, *tmp;
6003         LIST_HEAD(close_head);
6004
6005         BUG_ON(dev_boot_phase);
6006         ASSERT_RTNL();
6007
6008         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6009                 /* Some devices call without registering
6010                  * for initialization unwind. Remove those
6011                  * devices and proceed with the remaining.
6012                  */
6013                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6014                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6015                                  dev->name, dev);
6016
6017                         WARN_ON(1);
6018                         list_del(&dev->unreg_list);
6019                         continue;
6020                 }
6021                 dev->dismantle = true;
6022                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6023         }
6024
6025         /* If device is running, close it first. */
6026         list_for_each_entry(dev, head, unreg_list)
6027                 list_add_tail(&dev->close_list, &close_head);
6028         dev_close_many(&close_head, true);
6029
6030         list_for_each_entry(dev, head, unreg_list) {
6031                 /* And unlink it from device chain. */
6032                 unlist_netdevice(dev);
6033
6034                 dev->reg_state = NETREG_UNREGISTERING;
6035                 on_each_cpu(flush_backlog, dev, 1);
6036         }
6037
6038         synchronize_net();
6039
6040         list_for_each_entry(dev, head, unreg_list) {
6041                 struct sk_buff *skb = NULL;
6042
6043                 /* Shutdown queueing discipline. */
6044                 dev_shutdown(dev);
6045
6046
6047                 /* Notify protocols, that we are about to destroy
6048                    this device. They should clean all the things.
6049                 */
6050                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6051
6052                 if (!dev->rtnl_link_ops ||
6053                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6054                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6055                                                      GFP_KERNEL);
6056
6057                 /*
6058                  *      Flush the unicast and multicast chains
6059                  */
6060                 dev_uc_flush(dev);
6061                 dev_mc_flush(dev);
6062
6063                 if (dev->netdev_ops->ndo_uninit)
6064                         dev->netdev_ops->ndo_uninit(dev);
6065
6066                 if (skb)
6067                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6068
6069                 /* Notifier chain MUST detach us all upper devices. */
6070                 WARN_ON(netdev_has_any_upper_dev(dev));
6071
6072                 /* Remove entries from kobject tree */
6073                 netdev_unregister_kobject(dev);
6074 #ifdef CONFIG_XPS
6075                 /* Remove XPS queueing entries */
6076                 netif_reset_xps_queues_gt(dev, 0);
6077 #endif
6078         }
6079
6080         synchronize_net();
6081
6082         list_for_each_entry(dev, head, unreg_list)
6083                 dev_put(dev);
6084 }
6085
6086 static void rollback_registered(struct net_device *dev)
6087 {
6088         LIST_HEAD(single);
6089
6090         list_add(&dev->unreg_list, &single);
6091         rollback_registered_many(&single);
6092         list_del(&single);
6093 }
6094
6095 static netdev_features_t netdev_fix_features(struct net_device *dev,
6096         netdev_features_t features)
6097 {
6098         /* Fix illegal checksum combinations */
6099         if ((features & NETIF_F_HW_CSUM) &&
6100             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6101                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6102                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6103         }
6104
6105         /* TSO requires that SG is present as well. */
6106         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6107                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6108                 features &= ~NETIF_F_ALL_TSO;
6109         }
6110
6111         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6112                                         !(features & NETIF_F_IP_CSUM)) {
6113                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6114                 features &= ~NETIF_F_TSO;
6115                 features &= ~NETIF_F_TSO_ECN;
6116         }
6117
6118         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6119                                          !(features & NETIF_F_IPV6_CSUM)) {
6120                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6121                 features &= ~NETIF_F_TSO6;
6122         }
6123
6124         /* TSO ECN requires that TSO is present as well. */
6125         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6126                 features &= ~NETIF_F_TSO_ECN;
6127
6128         /* Software GSO depends on SG. */
6129         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6130                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6131                 features &= ~NETIF_F_GSO;
6132         }
6133
6134         /* UFO needs SG and checksumming */
6135         if (features & NETIF_F_UFO) {
6136                 /* maybe split UFO into V4 and V6? */
6137                 if (!((features & NETIF_F_GEN_CSUM) ||
6138                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6139                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6140                         netdev_dbg(dev,
6141                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6142                         features &= ~NETIF_F_UFO;
6143                 }
6144
6145                 if (!(features & NETIF_F_SG)) {
6146                         netdev_dbg(dev,
6147                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6148                         features &= ~NETIF_F_UFO;
6149                 }
6150         }
6151
6152 #ifdef CONFIG_NET_RX_BUSY_POLL
6153         if (dev->netdev_ops->ndo_busy_poll)
6154                 features |= NETIF_F_BUSY_POLL;
6155         else
6156 #endif
6157                 features &= ~NETIF_F_BUSY_POLL;
6158
6159         return features;
6160 }
6161
6162 int __netdev_update_features(struct net_device *dev)
6163 {
6164         netdev_features_t features;
6165         int err = 0;
6166
6167         ASSERT_RTNL();
6168
6169         features = netdev_get_wanted_features(dev);
6170
6171         if (dev->netdev_ops->ndo_fix_features)
6172                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6173
6174         /* driver might be less strict about feature dependencies */
6175         features = netdev_fix_features(dev, features);
6176
6177         if (dev->features == features)
6178                 return 0;
6179
6180         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6181                 &dev->features, &features);
6182
6183         if (dev->netdev_ops->ndo_set_features)
6184                 err = dev->netdev_ops->ndo_set_features(dev, features);
6185
6186         if (unlikely(err < 0)) {
6187                 netdev_err(dev,
6188                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6189                         err, &features, &dev->features);
6190                 return -1;
6191         }
6192
6193         if (!err)
6194                 dev->features = features;
6195
6196         return 1;
6197 }
6198
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes dev->features and calls netdev_features_change()
 *	whenever __netdev_update_features() returns non-zero.  Meant to be
 *	called after driver or hardware dependent conditions that
 *	influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6213
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes dev->features and then always calls
 *	netdev_features_change(), whatever the outcome of the
 *	recalculation.  Use this rather than netdev_update_features()
 *	when dev->vlan_features might have changed too, so that the
 *	change can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* result deliberately ignored: the notification is unconditional */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6230
6231 /**
6232  *      netif_stacked_transfer_operstate -      transfer operstate
6233  *      @rootdev: the root or lower level device to transfer state from
6234  *      @dev: the device to transfer operstate to
6235  *
6236  *      Transfer operational state from root to device. This is normally
6237  *      called when a stacking relationship exists between the root
6238  *      device and the device(a leaf device).
6239  */
6240 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6241                                         struct net_device *dev)
6242 {
6243         if (rootdev->operstate == IF_OPER_DORMANT)
6244                 netif_dormant_on(dev);
6245         else
6246                 netif_dormant_off(dev);
6247
6248         if (netif_carrier_ok(rootdev)) {
6249                 if (!netif_carrier_ok(dev))
6250                         netif_carrier_on(dev);
6251         } else {
6252                 if (netif_carrier_ok(dev))
6253                         netif_carrier_off(dev);
6254         }
6255 }
6256 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6257
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every element back at @dev.  kzalloc()
 * is tried first with vzalloc() as the fallback.  Returns 0 or -ENOMEM;
 * a queue count of zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = nqueues * sizeof(*queues);
	unsigned int q;

	BUG_ON(!nqueues);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;
	for (q = 0; q < nqueues; q++)
		queues[q].dev = dev;

	return 0;
}
#endif
6280
6281 static void netdev_init_one_queue(struct net_device *dev,
6282                                   struct netdev_queue *queue, void *_unused)
6283 {
6284         /* Initialize queue lock */
6285         spin_lock_init(&queue->_xmit_lock);
6286         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6287         queue->xmit_lock_owner = -1;
6288         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6289         queue->dev = dev;
6290 #ifdef CONFIG_BQL
6291         dql_init(&queue->dql, HZ);
6292 #endif
6293 }
6294
6295 static void netif_free_tx_queues(struct net_device *dev)
6296 {
6297         kvfree(dev->_tx);
6298 }
6299
6300 static int netif_alloc_netdev_queues(struct net_device *dev)
6301 {
6302         unsigned int count = dev->num_tx_queues;
6303         struct netdev_queue *tx;
6304         size_t sz = count * sizeof(*tx);
6305
6306         if (count < 1 || count > 0xffff)
6307                 return -EINVAL;
6308
6309         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6310         if (!tx) {
6311                 tx = vzalloc(sz);
6312                 if (!tx)
6313                         return -ENOMEM;
6314         }
6315         dev->_tx = tx;
6316
6317         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6318         spin_lock_init(&dev->tx_global_lock);
6319
6320         return 0;
6321 }
6322
6323 /**
6324  *      register_netdevice      - register a network device
6325  *      @dev: device to register
6326  *
6327  *      Take a completed network device structure and add it to the kernel
6328  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6329  *      chain. 0 is returned on success. A negative errno code is returned
6330  *      on a failure to set up the device, or if the name is a duplicate.
6331  *
6332  *      Callers must hold the rtnl semaphore. You may want
6333  *      register_netdev() instead of this.
6334  *
6335  *      BUGS:
6336  *      The locking appears insufficient to guarantee two parallel registers
6337  *      will not get the same name.
6338  */
6339
6340 int register_netdevice(struct net_device *dev)
6341 {
6342         int ret;
6343         struct net *net = dev_net(dev);
6344
6345         BUG_ON(dev_boot_phase);
6346         ASSERT_RTNL();
6347
6348         might_sleep();
6349
6350         /* When net_device's are persistent, this will be fatal. */
6351         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6352         BUG_ON(!net);
6353
6354         spin_lock_init(&dev->addr_list_lock);
6355         netdev_set_addr_lockdep_class(dev);
6356
6357         ret = dev_get_valid_name(net, dev, dev->name);
6358         if (ret < 0)
6359                 goto out;
6360
6361         /* Init, if this function is available */
6362         if (dev->netdev_ops->ndo_init) {
6363                 ret = dev->netdev_ops->ndo_init(dev);
6364                 if (ret) {
6365                         if (ret > 0)
6366                                 ret = -EIO;
6367                         goto out;
6368                 }
6369         }
6370
6371         if (((dev->hw_features | dev->features) &
6372              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6373             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6374              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6375                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6376                 ret = -EINVAL;
6377                 goto err_uninit;
6378         }
6379
6380         ret = -EBUSY;
6381         if (!dev->ifindex)
6382                 dev->ifindex = dev_new_index(net);
6383         else if (__dev_get_by_index(net, dev->ifindex))
6384                 goto err_uninit;
6385
6386         /* Transfer changeable features to wanted_features and enable
6387          * software offloads (GSO and GRO).
6388          */
6389         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6390         dev->features |= NETIF_F_SOFT_FEATURES;
6391         dev->wanted_features = dev->features & dev->hw_features;
6392
6393         if (!(dev->flags & IFF_LOOPBACK)) {
6394                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6395         }
6396
6397         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6398          */
6399         dev->vlan_features |= NETIF_F_HIGHDMA;
6400
6401         /* Make NETIF_F_SG inheritable to tunnel devices.
6402          */
6403         dev->hw_enc_features |= NETIF_F_SG;
6404
6405         /* Make NETIF_F_SG inheritable to MPLS.
6406          */
6407         dev->mpls_features |= NETIF_F_SG;
6408
6409         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6410         ret = notifier_to_errno(ret);
6411         if (ret)
6412                 goto err_uninit;
6413
6414         ret = netdev_register_kobject(dev);
6415         if (ret)
6416                 goto err_uninit;
6417         dev->reg_state = NETREG_REGISTERED;
6418
6419         __netdev_update_features(dev);
6420
6421         /*
6422          *      Default initial state at registry is that the
6423          *      device is present.
6424          */
6425
6426         set_bit(__LINK_STATE_PRESENT, &dev->state);
6427
6428         linkwatch_init_dev(dev);
6429
6430         dev_init_scheduler(dev);
6431         dev_hold(dev);
6432         list_netdevice(dev);
6433         add_device_randomness(dev->dev_addr, dev->addr_len);
6434
6435         /* If the device has permanent device address, driver should
6436          * set dev_addr and also addr_assign_type should be set to
6437          * NET_ADDR_PERM (default value).
6438          */
6439         if (dev->addr_assign_type == NET_ADDR_PERM)
6440                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6441
6442         /* Notify protocols, that a new device appeared. */
6443         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6444         ret = notifier_to_errno(ret);
6445         if (ret) {
6446                 rollback_registered(dev);
6447                 dev->reg_state = NETREG_UNREGISTERED;
6448         }
6449         /*
6450          *      Prevent userspace races by waiting until the network
6451          *      device is fully setup before sending notifications.
6452          */
6453         if (!dev->rtnl_link_ops ||
6454             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6455                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6456
6457 out:
6458         return ret;
6459
6460 err_uninit:
6461         if (dev->netdev_ops->ndo_uninit)
6462                 dev->netdev_ops->ndo_uninit(dev);
6463         goto out;
6464 }
6465 EXPORT_SYMBOL(register_netdevice);
6466
6467 /**
6468  *      init_dummy_netdev       - init a dummy network device for NAPI
6469  *      @dev: device to init
6470  *
6471  *      This takes a network device structure and initialize the minimum
6472  *      amount of fields so it can be used to schedule NAPI polls without
6473  *      registering a full blown interface. This is to be used by drivers
6474  *      that need to tie several hardware interfaces to a single NAPI
6475  *      poll scheduler due to HW limitations.
6476  */
6477 int init_dummy_netdev(struct net_device *dev)
6478 {
6479         /* Clear everything. Note we don't initialize spinlocks
6480          * are they aren't supposed to be taken by any of the
6481          * NAPI code and this dummy netdev is supposed to be
6482          * only ever used for NAPI polls
6483          */
6484         memset(dev, 0, sizeof(struct net_device));
6485
6486         /* make sure we BUG if trying to hit standard
6487          * register/unregister code path
6488          */
6489         dev->reg_state = NETREG_DUMMY;
6490
6491         /* NAPI wants this */
6492         INIT_LIST_HEAD(&dev->napi_list);
6493
6494         /* a dummy interface is started by default */
6495         set_bit(__LINK_STATE_PRESENT, &dev->state);
6496         set_bit(__LINK_STATE_START, &dev->state);
6497
6498         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6499          * because users of this 'device' dont need to change
6500          * its refcount.
6501          */
6502
6503         return 0;
6504 }
6505 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6506
6507
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	the device name is expanded if a format string was passed to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
6531
6532 int netdev_refcnt_read(const struct net_device *dev)
6533 {
6534         int i, refcnt = 0;
6535
6536         for_each_possible_cpu(i)
6537                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6538         return refcnt;
6539 }
6540 EXPORT_SYMBOL(netdev_refcnt_read);
6541
6542 /**
6543  * netdev_wait_allrefs - wait until all references are gone.
6544  * @dev: target net_device
6545  *
6546  * This is called when unregistering network devices.
6547  *
6548  * Any protocol or device that holds a reference should register
6549  * for netdevice notification, and cleanup and put back the
6550  * reference if they receive an UNREGISTER event.
6551  * We can get stuck here if buggy protocols don't correctly
6552  * call dev_put.
6553  */
6554 static void netdev_wait_allrefs(struct net_device *dev)
6555 {
6556         unsigned long rebroadcast_time, warning_time;
6557         int refcnt;
6558
6559         linkwatch_forget_dev(dev);
6560
6561         rebroadcast_time = warning_time = jiffies;
6562         refcnt = netdev_refcnt_read(dev);
6563
6564         while (refcnt != 0) {
6565                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6566                         rtnl_lock();
6567
6568                         /* Rebroadcast unregister notification */
6569                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6570
6571                         __rtnl_unlock();
6572                         rcu_barrier();
6573                         rtnl_lock();
6574
6575                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6576                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6577                                      &dev->state)) {
6578                                 /* We must not have linkwatch events
6579                                  * pending on unregister. If this
6580                                  * happens, we simply run the queue
6581                                  * unscheduled, resulting in a noop
6582                                  * for this device.
6583                                  */
6584                                 linkwatch_run_queue();
6585                         }
6586
6587                         __rtnl_unlock();
6588
6589                         rebroadcast_time = jiffies;
6590                 }
6591
6592                 msleep(250);
6593
6594                 refcnt = netdev_refcnt_read(dev);
6595
6596                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6597                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6598                                  dev->name, refcnt);
6599                         warning_time = jiffies;
6600                 }
6601         }
6602 }
6603
6604 /* The sequence is:
6605  *
6606  *      rtnl_lock();
6607  *      ...
6608  *      register_netdevice(x1);
6609  *      register_netdevice(x2);
6610  *      ...
6611  *      unregister_netdevice(y1);
6612  *      unregister_netdevice(y2);
6613  *      ...
6614  *      rtnl_unlock();
6615  *      free_netdev(y1);
6616  *      free_netdev(y2);
6617  *
6618  * We are invoked by rtnl_unlock().
6619  * This allows us to deal with problems:
6620  * 1) We can delete sysfs objects which invoke hotplug
6621  *    without deadlocking with linkwatch via keventd.
6622  * 2) Since we run with the RTNL semaphore not held, we can sleep
6623  *    safely in order to wait for the netdev refcnt to drop to zero.
6624  *
6625  * We must not return until all unregister events added during
6626  * the interval the lock was held have been completed.
6627  */
6628 void netdev_run_todo(void)
6629 {
6630         struct list_head list;
6631
6632         /* Snapshot list, allow later requests */
6633         list_replace_init(&net_todo_list, &list);
6634
6635         __rtnl_unlock();
6636
6637
6638         /* Wait for rcu callbacks to finish before next phase */
6639         if (!list_empty(&list))
6640                 rcu_barrier();
6641
6642         while (!list_empty(&list)) {
6643                 struct net_device *dev
6644                         = list_first_entry(&list, struct net_device, todo_list);
6645                 list_del(&dev->todo_list);
6646
6647                 rtnl_lock();
6648                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6649                 __rtnl_unlock();
6650
6651                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6652                         pr_err("network todo '%s' but state %d\n",
6653                                dev->name, dev->reg_state);
6654                         dump_stack();
6655                         continue;
6656                 }
6657
6658                 dev->reg_state = NETREG_UNREGISTERED;
6659
6660                 netdev_wait_allrefs(dev);
6661
6662                 /* paranoia */
6663                 BUG_ON(netdev_refcnt_read(dev));
6664                 BUG_ON(!list_empty(&dev->ptype_all));
6665                 BUG_ON(!list_empty(&dev->ptype_specific));
6666                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6667                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6668                 WARN_ON(dev->dn_ptr);
6669
6670                 if (dev->destructor)
6671                         dev->destructor(dev);
6672
6673                 /* Report a network device has been unregistered */
6674                 rtnl_lock();
6675                 dev_net(dev)->dev_unreg_count--;
6676                 __rtnl_unlock();
6677                 wake_up(&netdev_unregistering_wq);
6678
6679                 /* Free network device */
6680                 kobject_put(&dev->dev.kobj);
6681         }
6682 }
6683
6684 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6685  * fields in the same order, with only the type differing.
6686  */
6687 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6688                              const struct net_device_stats *netdev_stats)
6689 {
6690 #if BITS_PER_LONG == 64
6691         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6692         memcpy(stats64, netdev_stats, sizeof(*stats64));
6693 #else
6694         size_t i, n = sizeof(*stats64) / sizeof(u64);
6695         const unsigned long *src = (const unsigned long *)netdev_stats;
6696         u64 *dst = (u64 *)stats64;
6697
6698         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6699                      sizeof(*stats64) / sizeof(u64));
6700         for (i = 0; i < n; i++)
6701                 dst[i] = src[i];
6702 #endif
6703 }
6704 EXPORT_SYMBOL(netdev_stats_to_stats64);
6705
6706 /**
6707  *      dev_get_stats   - get network device statistics
6708  *      @dev: device to get statistics from
6709  *      @storage: place to store stats
6710  *
6711  *      Get network statistics from device. Return @storage.
6712  *      The device driver may provide its own method by setting
6713  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6714  *      otherwise the internal statistics structure is used.
6715  */
6716 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6717                                         struct rtnl_link_stats64 *storage)
6718 {
6719         const struct net_device_ops *ops = dev->netdev_ops;
6720
6721         if (ops->ndo_get_stats64) {
6722                 memset(storage, 0, sizeof(*storage));
6723                 ops->ndo_get_stats64(dev, storage);
6724         } else if (ops->ndo_get_stats) {
6725                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6726         } else {
6727                 netdev_stats_to_stats64(storage, &dev->stats);
6728         }
6729         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6730         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6731         return storage;
6732 }
6733 EXPORT_SYMBOL(dev_get_stats);
6734
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue first; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
			queue->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
6752
6753 static const struct ethtool_ops default_ethtool_ops;
6754
6755 void netdev_set_default_ethtool_ops(struct net_device *dev,
6756                                     const struct ethtool_ops *ops)
6757 {
6758         if (dev->ethtool_ops == &default_ethtool_ops)
6759                 dev->ethtool_ops = ops;
6760 }
6761 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6762
6763 void netdev_freemem(struct net_device *dev)
6764 {
6765         char *addr = (char *)dev - dev->padded;
6766
6767         kvfree(addr);
6768 }
6769
6770 /**
6771  *      alloc_netdev_mqs - allocate network device
6772  *      @sizeof_priv:           size of private data to allocate space for
6773  *      @name:                  device name format string
6774  *      @name_assign_type:      origin of device name
6775  *      @setup:                 callback to initialize device
6776  *      @txqs:                  the number of TX subqueues to allocate
6777  *      @rxqs:                  the number of RX subqueues to allocate
6778  *
6779  *      Allocates a struct net_device with private data area for driver use
6780  *      and performs basic initialization.  Also allocates subqueue structs
6781  *      for each queue on the device.
6782  */
6783 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6784                 unsigned char name_assign_type,
6785                 void (*setup)(struct net_device *),
6786                 unsigned int txqs, unsigned int rxqs)
6787 {
6788         struct net_device *dev;
6789         size_t alloc_size;
6790         struct net_device *p;
6791
6792         BUG_ON(strlen(name) >= sizeof(dev->name));
6793
6794         if (txqs < 1) {
6795                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6796                 return NULL;
6797         }
6798
6799 #ifdef CONFIG_SYSFS
6800         if (rxqs < 1) {
6801                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6802                 return NULL;
6803         }
6804 #endif
6805
6806         alloc_size = sizeof(struct net_device);
6807         if (sizeof_priv) {
6808                 /* ensure 32-byte alignment of private area */
6809                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6810                 alloc_size += sizeof_priv;
6811         }
6812         /* ensure 32-byte alignment of whole construct */
6813         alloc_size += NETDEV_ALIGN - 1;
6814
6815         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6816         if (!p)
6817                 p = vzalloc(alloc_size);
6818         if (!p)
6819                 return NULL;
6820
6821         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6822         dev->padded = (char *)dev - (char *)p;
6823
6824         dev->pcpu_refcnt = alloc_percpu(int);
6825         if (!dev->pcpu_refcnt)
6826                 goto free_dev;
6827
6828         if (dev_addr_init(dev))
6829                 goto free_pcpu;
6830
6831         dev_mc_init(dev);
6832         dev_uc_init(dev);
6833
6834         dev_net_set(dev, &init_net);
6835
6836         dev->gso_max_size = GSO_MAX_SIZE;
6837         dev->gso_max_segs = GSO_MAX_SEGS;
6838         dev->gso_min_segs = 0;
6839
6840         INIT_LIST_HEAD(&dev->napi_list);
6841         INIT_LIST_HEAD(&dev->unreg_list);
6842         INIT_LIST_HEAD(&dev->close_list);
6843         INIT_LIST_HEAD(&dev->link_watch_list);
6844         INIT_LIST_HEAD(&dev->adj_list.upper);
6845         INIT_LIST_HEAD(&dev->adj_list.lower);
6846         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6847         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6848         INIT_LIST_HEAD(&dev->ptype_all);
6849         INIT_LIST_HEAD(&dev->ptype_specific);
6850         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6851         setup(dev);
6852
6853         dev->num_tx_queues = txqs;
6854         dev->real_num_tx_queues = txqs;
6855         if (netif_alloc_netdev_queues(dev))
6856                 goto free_all;
6857
6858 #ifdef CONFIG_SYSFS
6859         dev->num_rx_queues = rxqs;
6860         dev->real_num_rx_queues = rxqs;
6861         if (netif_alloc_rx_queues(dev))
6862                 goto free_all;
6863 #endif
6864
6865         strcpy(dev->name, name);
6866         dev->name_assign_type = name_assign_type;
6867         dev->group = INIT_NETDEV_GROUP;
6868         if (!dev->ethtool_ops)
6869                 dev->ethtool_ops = &default_ethtool_ops;
6870         return dev;
6871
6872 free_all:
6873         free_netdev(dev);
6874         return NULL;
6875
6876 free_pcpu:
6877         free_percpu(dev->pcpu_refcnt);
6878 free_dev:
6879         netdev_freemem(dev);
6880         return NULL;
6881 }
6882 EXPORT_SYMBOL(alloc_netdev_mqs);
6883
6884 /**
6885  *      free_netdev - free network device
6886  *      @dev: device
6887  *
6888  *      This function does the last stage of destroying an allocated device
6889  *      interface. The reference to the device object is released.
6890  *      If this is the last reference then it will be freed.
6891  */
6892 void free_netdev(struct net_device *dev)
6893 {
6894         struct napi_struct *p, *n;
6895
6896         netif_free_tx_queues(dev);
6897 #ifdef CONFIG_SYSFS
6898         kvfree(dev->_rx);
6899 #endif
6900
6901         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6902
6903         /* Flush device addresses */
6904         dev_addr_flush(dev);
6905
6906         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6907                 netif_napi_del(p);
6908
6909         free_percpu(dev->pcpu_refcnt);
6910         dev->pcpu_refcnt = NULL;
6911
6912         /*  Compatibility with error handling in drivers */
6913         if (dev->reg_state == NETREG_UNINITIALIZED) {
6914                 netdev_freemem(dev);
6915                 return;
6916         }
6917
6918         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6919         dev->reg_state = NETREG_RELEASED;
6920
6921         /* will free via device release */
6922         put_device(&dev->dev);
6923 }
6924 EXPORT_SYMBOL(free_netdev);
6925
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Waits until packets that are currently being received have been
 *      dealt with; packets that start later are not held back.
 *
 *      May sleep.  The expedited RCU grace period is used when
 *      rtnl_is_locked() reports the rtnl lock as taken, the normal one
 *      otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
6940 EXPORT_SYMBOL(synchronize_net);
6941
6942 /**
6943  *      unregister_netdevice_queue - remove device from the kernel
6944  *      @dev: device
6945  *      @head: list
6946  *
6947  *      This function shuts down a device interface and removes it
6948  *      from the kernel tables.
6949  *      If head not NULL, device is queued to be unregistered later.
6950  *
6951  *      Callers must hold the rtnl semaphore.  You may want
6952  *      unregister_netdev() instead of this.
6953  */
6954
6955 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6956 {
6957         ASSERT_RTNL();
6958
6959         if (head) {
6960                 list_move_tail(&dev->unreg_list, head);
6961         } else {
6962                 rollback_registered(dev);
6963                 /* Finish processing unregister after unlock */
6964                 net_set_todo(dev);
6965         }
6966 }
6967 EXPORT_SYMBOL(unregister_netdevice_queue);
6968
6969 /**
6970  *      unregister_netdevice_many - unregister many devices
6971  *      @head: list of devices
6972  *
6973  *  Note: As most callers use a stack allocated list_head,
6974  *  we force a list_del() to make sure stack wont be corrupted later.
6975  */
6976 void unregister_netdevice_many(struct list_head *head)
6977 {
6978         struct net_device *dev;
6979
6980         if (!list_empty(head)) {
6981                 rollback_registered_many(head);
6982                 list_for_each_entry(dev, head, unreg_list)
6983                         net_set_todo(dev);
6984                 list_del(head);
6985         }
6986 }
6987 EXPORT_SYMBOL(unregister_netdevice_many);
6988
6989 /**
6990  *      unregister_netdev - remove device from the kernel
6991  *      @dev: device
6992  *
6993  *      This function shuts down a device interface and removes it
6994  *      from the kernel tables.
6995  *
6996  *      This is just a wrapper for unregister_netdevice that takes
6997  *      the rtnl semaphore.  In general you want to use this and not
6998  *      unregister_netdevice.
      *
      *      The rtnl semaphore is both taken and released inside this
      *      function.
6999  */
7000 void unregister_netdev(struct net_device *dev)
7001 {
7002         rtnl_lock();
7003         unregister_netdevice(dev);
             /* The comment in default_device_exit_batch() states that the
              * netdev todo list is run from rtnl_unlock().
              */
7004         rtnl_unlock();
7005 }
7006 EXPORT_SYMBOL(unregister_netdev);
7007
7008 /**
7009  *      dev_change_net_namespace - move device to different network namespace
7010  *      @dev: device
7011  *      @net: network namespace
7012  *      @pat: If not NULL name pattern to try if the current device name
7013  *            is already taken in the destination network namespace.
7014  *
7015  *      This function shuts down a device interface and moves it
7016  *      to a new network namespace. On success 0 is returned, on
7017  *      a failure a negative errno code is returned.
      *
      *      Returns -EINVAL if @dev has NETIF_F_NETNS_LOCAL set or is not
      *      in state NETREG_REGISTERED, and -EEXIST if the device name is
      *      already used in @net and no replacement could be derived from
      *      @pat.  If @dev already lives in @net nothing is done and 0 is
      *      returned.
7018  *
7019  *      Callers must hold the rtnl semaphore.
7020  */
7021
7022 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7023 {
7024         int err;
7025
7026         ASSERT_RTNL();
7027
7028         /* Don't allow namespace local devices to be moved. */
7029         err = -EINVAL;
7030         if (dev->features & NETIF_F_NETNS_LOCAL)
7031                 goto out;
7032
7033         /* Ensure the device has been registered */
7034         if (dev->reg_state != NETREG_REGISTERED)
7035                 goto out;
7036
7037         /* Get out if there is nothing to do */
7038         err = 0;
7039         if (net_eq(dev_net(dev), net))
7040                 goto out;
7041
7042         /* Pick the destination device name, and ensure
7043          * we can use it in the destination network namespace.
7044          */
7045         err = -EEXIST;
7046         if (__dev_get_by_name(net, dev->name)) {
7047                 /* We get here if we can't use the current device name */
7048                 if (!pat)
7049                         goto out;
7050                 if (dev_get_valid_name(net, dev, pat) < 0)
7051                         goto out;
7052         }
7053
             /* From here on there is no further jump to "out": every step
              * below is carried out unconditionally.
              */
7054         /*
7055          * And now a mini version of register_netdevice unregister_netdevice.
7056          */
7057
7058         /* If device is running close it first. */
7059         dev_close(dev);
7060
7061         /* And unlink it from device chain */
             /* NOTE(review): this -ENODEV is never returned; err is
              * reassigned by device_rename() and then set to 0 below
              * before the function can reach "out".
              */
7062         err = -ENODEV;
7063         unlist_netdevice(dev);
7064
7065         synchronize_net();
7066
7067         /* Shutdown queueing discipline. */
7068         dev_shutdown(dev);
7069
7070         /* Notify protocols, that we are about to destroy
7071            this device. They should clean all the things.
7072
7073            Note that dev->reg_state stays at NETREG_REGISTERED.
7074            This is wanted because this way 8021q and macvlan know
7075            the device is just moving and can keep their slaves up.
7076         */
7077         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7078         rcu_barrier();
7079         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7080         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7081
7082         /*
7083          *      Flush the unicast and multicast chains
7084          */
7085         dev_uc_flush(dev);
7086         dev_mc_flush(dev);
7087
7088         /* Send a netdev-removed uevent to the old namespace */
7089         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7090         netdev_adjacent_del_links(dev);
7091
7092         /* Actually switch the network namespace */
7093         dev_net_set(dev, net);
7094
7095         /* If there is an ifindex conflict assign a new one */
7096         if (__dev_get_by_index(net, dev->ifindex))
7097                 dev->ifindex = dev_new_index(net);
7098
7099         /* Send a netdev-add uevent to the new namespace */
7100         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7101         netdev_adjacent_add_links(dev);
7102
7103         /* Fixup kobjects */
             /* A rename failure only triggers a warning; the move still
              * completes and 0 is returned.
              */
7104         err = device_rename(&dev->dev, dev->name);
7105         WARN_ON(err);
7106
7107         /* Add the device back in the hashes */
7108         list_netdevice(dev);
7109
7110         /* Notify protocols, that a new device appeared. */
7111         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7112
7113         /*
7114          *      Prevent userspace races by waiting until the network
7115          *      device is fully setup before sending notifications.
7116          */
7117         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7118
7119         synchronize_net();
7120         err = 0;
7121 out:
7122         return err;
7123 }
7124 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7125
7126 static int dev_cpu_callback(struct notifier_block *nfb,
7127                             unsigned long action,
7128                             void *ocpu)
7129 {
             /* CPU notifier callback.  For CPU_DEAD / CPU_DEAD_FROZEN the
              * pending softnet work of the CPU whose number is passed in
              * @ocpu is taken over by the CPU running this callback; every
              * other action is ignored.  Always returns NOTIFY_OK.
              */
7130         struct sk_buff **list_skb;
7131         struct sk_buff *skb;
7132         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7133         struct softnet_data *sd, *oldsd;
7134
7135         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7136                 return NOTIFY_OK;
7137
             /* The list splicing below runs with local interrupts off. */
7138         local_irq_disable();
7139         cpu = smp_processor_id();
7140         sd = &per_cpu(softnet_data, cpu);
7141         oldsd = &per_cpu(softnet_data, oldcpu);
7142
7143         /* Find end of our completion_queue. */
7144         list_skb = &sd->completion_queue;
7145         while (*list_skb)
7146                 list_skb = &(*list_skb)->next;
7147         /* Append completion queue from offline CPU. */
7148         *list_skb = oldsd->completion_queue;
7149         oldsd->completion_queue = NULL;
7150
7151         /* Append output queue from offline CPU. */
7152         if (oldsd->output_queue) {
7153                 *sd->output_queue_tailp = oldsd->output_queue;
7154                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7155                 oldsd->output_queue = NULL;
                     /* Leave the old queue empty, with its tail pointer
                      * aimed back at its own head.
                      */
7156                 oldsd->output_queue_tailp = &oldsd->output_queue;
7157         }
7158         /* Append NAPI poll list from offline CPU, with one exception :
7159          * process_backlog() must be called by cpu owning percpu backlog.
7160          * We properly handle process_queue & input_pkt_queue later.
7161          */
7162         while (!list_empty(&oldsd->poll_list)) {
7163                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7164                                                             struct napi_struct,
7165                                                             poll_list);
7166
7167                 list_del_init(&napi->poll_list);
                     /* The backlog NAPI is not migrated: its state is just
                      * cleared.  Every other entry is scheduled on this CPU.
                      */
7168                 if (napi->poll == process_backlog)
7169                         napi->state = 0;
7170                 else
7171                         ____napi_schedule(sd, napi);
7172         }
7173
             /* Raise the TX softirq on this CPU after taking over the queues. */
7174         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7175         local_irq_enable();
7176
7177         /* Process offline CPU's input_pkt_queue */
             /* Both backlog queues of the old CPU are drained by feeding
              * each skb back in through netif_rx_ni().
              * NOTE(review): process_queue is drained with __skb_dequeue()
              * while input_pkt_queue uses skb_dequeue() - presumably only
              * the latter needs its queue lock here; confirm.
              */
7178         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7179                 netif_rx_ni(skb);
7180                 input_queue_head_incr(oldsd);
7181         }
7182         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7183                 netif_rx_ni(skb);
7184                 input_queue_head_incr(oldsd);
7185         }
7186
7187         return NOTIFY_OK;
7188 }
7189
7190
7191 /**
7192  *      netdev_increment_features - increment feature set by one
7193  *      @all: current feature set
7194  *      @one: new feature set
7195  *      @mask: mask feature set
7196  *
7197  *      Computes a new feature set after adding a device with feature set
7198  *      @one to the master device with current feature set @all.  Will not
7199  *      enable anything that is off in @mask. Returns the new feature set.
7200  */
7201 netdev_features_t netdev_increment_features(netdev_features_t all,
7202         netdev_features_t one, netdev_features_t mask)
7203 {
7204         if (mask & NETIF_F_GEN_CSUM)
7205                 mask |= NETIF_F_ALL_CSUM;
7206         mask |= NETIF_F_VLAN_CHALLENGED;
7207
7208         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7209         all &= one | ~NETIF_F_ALL_FOR_ALL;
7210
7211         /* If one device supports hw checksumming, set for all. */
7212         if (all & NETIF_F_GEN_CSUM)
7213                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7214
7215         return all;
7216 }
7217 EXPORT_SYMBOL(netdev_increment_features);
7218
7219 static struct hlist_head * __net_init netdev_create_hash(void)
7220 {
7221         int i;
7222         struct hlist_head *hash;
7223
7224         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7225         if (hash != NULL)
7226                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7227                         INIT_HLIST_HEAD(&hash[i]);
7228
7229         return hash;
7230 }
7231
7232 /* Initialize per network namespace state */
7233 static int __net_init netdev_init(struct net *net)
7234 {
7235         if (net != &init_net)
7236                 INIT_LIST_HEAD(&net->dev_base_head);
7237
7238         net->dev_name_head = netdev_create_hash();
7239         if (net->dev_name_head == NULL)
7240                 goto err_name;
7241
7242         net->dev_index_head = netdev_create_hash();
7243         if (net->dev_index_head == NULL)
7244                 goto err_idx;
7245
7246         return 0;
7247
7248 err_idx:
7249         kfree(net->dev_name_head);
7250 err_name:
7251         return -ENOMEM;
7252 }
7253
7254 /**
7255  *      netdev_drivername - network driver for the device
7256  *      @dev: network device
7257  *
7258  *      Determine network driver for device.
7259  */
7260 const char *netdev_drivername(const struct net_device *dev)
7261 {
7262         const struct device_driver *driver;
7263         const struct device *parent;
7264         const char *empty = "";
7265
7266         parent = dev->dev.parent;
7267         if (!parent)
7268                 return empty;
7269
7270         driver = parent->driver;
7271         if (driver && driver->name)
7272                 return driver->name;
7273         return empty;
7274 }
7275
7276 static void __netdev_printk(const char *level, const struct net_device *dev,
7277                             struct va_format *vaf)
7278 {
7279         if (dev && dev->dev.parent) {
7280                 dev_printk_emit(level[1] - '0',
7281                                 dev->dev.parent,
7282                                 "%s %s %s%s: %pV",
7283                                 dev_driver_string(dev->dev.parent),
7284                                 dev_name(dev->dev.parent),
7285                                 netdev_name(dev), netdev_reg_state(dev),
7286                                 vaf);
7287         } else if (dev) {
7288                 printk("%s%s%s: %pV",
7289                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7290         } else {
7291                 printk("%s(NULL net_device): %pV", level, vaf);
7292         }
7293 }
7294
7295 void netdev_printk(const char *level, const struct net_device *dev,
7296                    const char *format, ...)
7297 {
7298         struct va_format vaf;
7299         va_list args;
7300
7301         va_start(args, format);
7302
7303         vaf.fmt = format;
7304         vaf.va = &args;
7305
7306         __netdev_printk(level, dev, &vaf);
7307
7308         va_end(args);
7309 }
7310 EXPORT_SYMBOL(netdev_printk);
7311
7312 #define define_netdev_printk_level(func, level)                 \
7313 void func(const struct net_device *dev, const char *fmt, ...)   \
7314 {                                                               \
7315         struct va_format vaf;                                   \
7316         va_list args;                                           \
7317                                                                 \
7318         va_start(args, fmt);                                    \
7319                                                                 \
7320         vaf.fmt = fmt;                                          \
7321         vaf.va = &args;                                         \
7322                                                                 \
7323         __netdev_printk(level, dev, &vaf);                      \
7324                                                                 \
7325         va_end(args);                                           \
7326 }                                                               \
7327 EXPORT_SYMBOL(func);
7328
7329 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7330 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7331 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7332 define_netdev_printk_level(netdev_err, KERN_ERR);
7333 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7334 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7335 define_netdev_printk_level(netdev_info, KERN_INFO);
7336
7337 static void __net_exit netdev_exit(struct net *net)
7338 {
             /* Per-namespace teardown: free the name and ifindex hash
              * tables that netdev_init() allocated for this namespace.
              */
7339         kfree(net->dev_name_head);
7340         kfree(net->dev_index_head);
7341 }
7342
7343 static struct pernet_operations __net_initdata netdev_net_ops = {
             /* Per-namespace hooks for the device hash tables; registered
              * from net_dev_init() with register_pernet_subsys().
              */
7344         .init = netdev_init,
7345         .exit = netdev_exit,
7346 };
7347
7348 static void __net_exit default_device_exit(struct net *net)
7349 {
7350         struct net_device *dev, *aux;
7351         /*
7352          * Push all migratable network devices back to the
7353          * initial network namespace
7354          */
7355         rtnl_lock();
7356         for_each_netdev_safe(net, dev, aux) {
7357                 int err;
7358                 char fb_name[IFNAMSIZ];
7359
7360                 /* Ignore unmoveable devices (i.e. loopback) */
7361                 if (dev->features & NETIF_F_NETNS_LOCAL)
7362                         continue;
7363
7364                 /* Leave virtual devices for the generic cleanup */
7365                 if (dev->rtnl_link_ops)
7366                         continue;
7367
7368                 /* Push remaining network devices to init_net */
7369                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7370                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7371                 if (err) {
7372                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7373                                  __func__, dev->name, err);
7374                         BUG();
7375                 }
7376         }
7377         rtnl_unlock();
7378 }
7379
7380 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7381 {
7382         /* Return with the rtnl_lock held when there are no network
7383          * devices unregistering in any network namespace in net_list.
7384          */
7385         struct net *net;
7386         bool unregistering;
7387         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7388
             /* Get on the wait queue before the first check of the counters. */
7389         add_wait_queue(&netdev_unregistering_wq, &wait);
7390         for (;;) {
7391                 unregistering = false;
7392                 rtnl_lock();
                     /* One namespace with a non-zero dev_unreg_count is
                      * enough to have to wait again.
                      */
7393                 list_for_each_entry(net, net_list, exit_list) {
7394                         if (net->dev_unreg_count > 0) {
7395                                 unregistering = true;
7396                                 break;
7397                         }
7398                 }
                     /* Nothing pending: leave the loop with rtnl still held. */
7399                 if (!unregistering)
7400                         break;
                     /* NOTE(review): __rtnl_unlock() rather than
                      * rtnl_unlock() - presumably so the lock is dropped
                      * without the extra work rtnl_unlock() does; confirm.
                      */
7401                 __rtnl_unlock();
7402
                     /* Sleep until woken on netdev_unregistering_wq, then
                      * take the lock and check all namespaces again.
                      */
7403                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7404         }
7405         remove_wait_queue(&netdev_unregistering_wq, &wait);
7406 }
7407
7408 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7409 {
7410         /* At exit all network devices most be removed from a network
7411          * namespace.  Do this in the reverse order of registration.
7412          * Do this across as many network namespaces as possible to
7413          * improve batching efficiency.
7414          */
7415         struct net_device *dev;
7416         struct net *net;
7417         LIST_HEAD(dev_kill_list);
7418
7419         /* To prevent network device cleanup code from dereferencing
7420          * loopback devices or network devices that have been freed
7421          * wait here for all pending unregistrations to complete,
7422          * before unregistring the loopback device and allowing the
7423          * network namespace be freed.
7424          *
7425          * The netdev todo list containing all network devices
7426          * unregistrations that happen in default_device_exit_batch
7427          * will run in the rtnl_unlock() at the end of
7428          * default_device_exit_batch.
7429          */
7430         rtnl_lock_unregistering(net_list);
7431         list_for_each_entry(net, net_list, exit_list) {
7432                 for_each_netdev_reverse(net, dev) {
7433                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7434                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7435                         else
7436                                 unregister_netdevice_queue(dev, &dev_kill_list);
7437                 }
7438         }
7439         unregister_netdevice_many(&dev_kill_list);
7440         rtnl_unlock();
7441 }
7442
7443 static struct pernet_operations __net_initdata default_device_ops = {
             /* Per namespace: move migratable devices back to init_net. */
7444         .exit = default_device_exit,
             /* Per batch of namespaces: unregister the devices still left. */
7445         .exit_batch = default_device_exit_batch,
7446 };
7447
7448 /*
7449  *      Initialize the DEV module. At boot time this walks the device list and
7450  *      unhooks any devices that fail to initialise (normally hardware not
7451  *      present) and leaves us with a valid list of present and active devices.
7452  *
7453  */
7454
7455 /*
7456  *       This is called single threaded during boot, so no need
7457  *       to take the rtnl semaphore.
      *
      *       Returns 0 on success.  Every failure path returns -ENOMEM,
      *       whatever the failing call itself reported, and nothing that
      *       was set up before the failure is undone.
7458  */
7459 static int __init net_dev_init(void)
7460 {
7461         int i, rc = -ENOMEM;
7462
7463         BUG_ON(!dev_boot_phase);
7464
7465         if (dev_proc_init())
7466                 goto out;
7467
7468         if (netdev_kobject_init())
7469                 goto out;
7470
             /* Start with empty packet type and offload lists. */
7471         INIT_LIST_HEAD(&ptype_all);
7472         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7473                 INIT_LIST_HEAD(&ptype_base[i]);
7474
7475         INIT_LIST_HEAD(&offload_base);
7476
             /* Per-namespace name/index hash tables (netdev_init/netdev_exit). */
7477         if (register_pernet_subsys(&netdev_net_ops))
7478                 goto out;
7479
7480         /*
7481          *      Initialise the packet receive queues.
7482          */
7483
7484         for_each_possible_cpu(i) {
7485                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7486
7487                 skb_queue_head_init(&sd->input_pkt_queue);
7488                 skb_queue_head_init(&sd->process_queue);
7489                 INIT_LIST_HEAD(&sd->poll_list);
                     /* Empty output queue: the tail pointer aims at the head. */
7490                 sd->output_queue_tailp = &sd->output_queue;
7491 #ifdef CONFIG_RPS
7492                 sd->csd.func = rps_trigger_softirq;
7493                 sd->csd.info = sd;
7494                 sd->cpu = i;
7495 #endif
7496
                     /* The per-cpu backlog is polled by process_backlog(). */
7497                 sd->backlog.poll = process_backlog;
7498                 sd->backlog.weight = weight_p;
7499         }
7500
             /* The BUG_ON() at the top requires this flag to still be set. */
7501         dev_boot_phase = 0;
7502
7503         /* The loopback device is special if any other network devices
7504          * is present in a network namespace the loopback device must
7505          * be present. Since we now dynamically allocate and free the
7506          * loopback device ensure this invariant is maintained by
7507          * keeping the loopback device as the first device on the
7508          * list of network devices.  Ensuring the loopback devices
7509          * is the first device that appears and the last network device
7510          * that disappears.
7511          */
7512         if (register_pernet_device(&loopback_net_ops))
7513                 goto out;
7514
7515         if (register_pernet_device(&default_device_ops))
7516                 goto out;
7517
             /* Install the network TX and RX softirq handlers. */
7518         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7519         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7520
             /* dev_cpu_callback() takes over the queues of a dead CPU. */
7521         hotcpu_notifier(dev_cpu_callback, 0);
7522         dst_init();
7523         rc = 0;
7524 out:
7525         return rc;
7526 }
7527
7528 subsys_initcall(net_dev_init);