/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
	spin_lock(&sd->input_pkt_queue.lock);
}

static inline void rps_unlock(struct softnet_data *sd)
{
	spin_unlock(&sd->input_pkt_queue.lock);
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}
/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;

	return pt->dev ? &pt->dev->ptype_specific :
			 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that CPUs
 *	that are in the middle of receiving packets will see the new packet
 *	type (until the next received packet).
 */
void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
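
/*
 * Example (illustrative sketch, not part of the original file): a module
 * that wants to tap every received ARP frame could register a handler
 * roughly like this; the my_arp_rcv() and my_arp_ptype names are made up.
 *
 *	static int my_arp_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		consume_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_arp_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ARP),
 *		.func	= my_arp_rcv,
 *	};
 *
 *	dev_add_pack(&my_arp_ptype);
 */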
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that CPUs
 *	that are in the middle of receiving packets will see the new offload
 *	handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
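
/*
 * Example (illustrative sketch, not part of the original file): a protocol
 * that provides GRO callbacks registers them once at init time; the
 * myproto_* names and ETH_P_MYPROTO are hypothetical.
 *
 *	static struct packet_offload myproto_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_MYPROTO),
 *		.priority = 10,
 *		.callbacks = {
 *			.gro_receive  = myproto_gro_receive,
 *			.gro_complete = myproto_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&myproto_offload);
 */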
/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
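
/*
 * Example (illustrative, assumed format): with the table above, a kernel
 * command line such as
 *
 *	netdev=5,0x340,0xd0000,0xd4000,eth0
 *
 * is parsed by netdev_boot_setup() as <irq>,<io>,<mem_start>,<mem_end>,<name>
 * and later applied by netdev_boot_setup_check() when a driver probes "eth0".
 */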
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */
int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);
/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: the packet
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
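
/*
 * Example (illustrative): dev_get_by_name() returns a held reference, so the
 * caller must pair it with dev_put() once the device is no longer needed.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 */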
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
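
/*
 * Example (illustrative): the _rcu lookups take no reference, so the result
 * is only valid inside the RCU read-side section unless dev_hold() is used.
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		mtu = dev->mtu;
 *	rcu_read_unlock();
 */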
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}
/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */
struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */
int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
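
/*
 * Example (illustrative): drivers usually let the core pick the unit number
 * by passing a format string, e.g.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *
 * which stores the first free name in dev->name ("dummy0", "dummy1", ...)
 * and returns the unit number chosen, or a negative errno.
 */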
static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

int dev_get_valid_name(struct net *net, struct net_device *dev,
		       const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
EXPORT_SYMBOL(dev_get_valid_name);
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d" for
 *	wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;
	memcpy(dev->ifalias, alias, len);
	dev->ifalias[len] = 0;

	return 0;
}
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);
/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret) {
		clear_bit(__LINK_STATE_START, &dev->state);
	} else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}
/**
 *	dev_open	- prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
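
/*
 * Example (illustrative): bringing an interface up from kernel code is the
 * equivalent of "ip link set dev eth0 up" and must be done under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */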
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}
int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */
int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
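
/*
 * Example (illustrative sketch): a subsystem interested in device events
 * registers a notifier_block; my_netdev_event() is a hypothetical callback.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */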
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */
int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, dev, &info.info);
}
#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_inc(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)		\
			__net_timestamp(SKB);			\
	}							\
bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
	    unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
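
/*
 * Example (illustrative sketch): a virtual device pair can loop a frame from
 * its ndo_start_xmit() into the peer's receive path; struct my_priv and its
 * peer pointer are hypothetical driver state.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */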
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
/*
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}
static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;

			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}
	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
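
/*
 * Example (illustrative): a multiqueue driver can pin transmit queue i to
 * the CPU it expects to feed it, one CPU per queue:
 *
 *	cpumask_var_t mask;
 *
 *	if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpumask_set_cpu(i, mask);
 *		netif_set_xps_queue(dev, mask, i);
 *		free_cpumask_var(mask);
 *	}
 */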
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	bool disabling;
	int rc;

	disabling = txq < dev->real_num_tx_queues;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		dev->real_num_tx_queues = txq;

		if (disabling) {
			synchronize_net();
			qdisc_reset_all_tx_gt(dev, txq);
			netif_reset_xps_queues_gt(dev, txq);
		}
	} else {
		dev->real_num_tx_queues = txq;
	}

	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
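
/*
 * Example (illustrative): a driver that negotiated fewer hardware rings
 * (e.g. after an ethtool channel change) shrinks its visible queue counts
 * under RTNL:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, new_tx);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, new_rx);
 *	rtnl_unlock();
 */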
2247 * netif_set_real_num_rx_queues - set actual number of RX queues used
2248 * @dev: Network device
2249 * @rxq: Actual number of RX queues
2251 * This must be called either with the rtnl_lock held or before
2252 * registration of the net device. Returns 0 on success, or a
2253 * negative error code. If called before registration, it always
2256 int netif_set_real_num_rx_queues(struct net_device
*dev
, unsigned int rxq
)
2260 if (rxq
< 1 || rxq
> dev
->num_rx_queues
)
2263 if (dev
->reg_state
== NETREG_REGISTERED
) {
2266 rc
= net_rx_queue_update_kobjects(dev
, dev
->real_num_rx_queues
,
2272 dev
->real_num_rx_queues
= rxq
;
2275 EXPORT_SYMBOL(netif_set_real_num_rx_queues
);

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
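
/*
 * Illustrative use (an assumption, not taken from this file): drivers
 * commonly clamp their ring count to this helper plus their own hardware
 * limit, e.g.:
 *
 *	nqueues = min_t(int, netif_get_num_default_rss_queues(),
 *			FOO_HW_MAX_QUEUES);	// FOO_HW_MAX_QUEUES is hypothetical
 */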

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}

void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/**
 *	netif_wake_subqueue - allow sending packets on subqueue
 *	@dev: network device
 *	@queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(txq->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);

void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
	if (in_irq() || irqs_disabled())
		__dev_kfree_skb_irq(skb, reason);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
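
/*
 * Usage sketch (hypothetical driver code, not part of this file): a TX
 * completion handler that may run in hard-irq context frees transmitted
 * skbs with the _irq/_any variants instead of plain kfree_skb():
 *
 *	static void foo_clean_tx_irq(struct foo_ring *ring)
 *	{
 *		struct sk_buff *skb;
 *
 *		while ((skb = foo_next_completed_skb(ring)))	// hypothetical helper
 *			dev_consume_skb_irq(skb);	// thin wrapper around __dev_kfree_skb_irq()
 *	}
 */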

/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
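
/*
 * Usage sketch (illustrative, not part of this file): drivers usually pair
 * these helpers in their suspend/resume callbacks:
 *
 *	static int foo_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		// ... stop DMA, save state, power down (hardware specific) ...
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		// ... power up, restore state (hardware specific) ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */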

/*
 * Returns a Tx hash based on the given packet descriptor and a Tx queue
 * count to be used as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);

static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features = 0;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
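
/*
 * Usage sketch (illustrative, not part of this file): a driver whose
 * hardware cannot checksum a particular packet can fall back to a software
 * checksum right before handing the skb to the hardware:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !foo_hw_can_csum(skb) &&		// hypothetical capability test
 *	    skb_checksum_help(skb))
 *		goto drop;			// checksum could not be completed
 */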

__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb->data;
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}

/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);

/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL &&
		       skb->ip_summed != CHECKSUM_UNNECESSARY;

	return skb->ip_summed == CHECKSUM_NONE;
}

/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
		skb_warn_bad_offload(skb);

	return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

static netdev_features_t harmonize_features(struct sk_buff *skb,
					    netdev_features_t features)
{
	int tmp;
	__be16 type;

	type = skb_network_protocol(skb, &tmp);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~NETIF_F_ALL_CSUM;
	}
	if (illegal_highdma(skb->dev, skb))
		features &= ~NETIF_F_SG;

	return features;
}

netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);

static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}

netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
		features &= ~NETIF_F_GSO_MASK;

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}

static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto))
		skb = __vlan_hwaccel_push_inside(skb);
	return skb;
}

static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_ALL_CSUM) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb wont be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
			const struct tcphdr *th;
			struct tcphdr _tcphdr;

			th = skb_header_pointer(skb, skb_transport_offset(skb),
						sizeof(_tcphdr), &_tcphdr);
			if (likely(th))
				hdr_len += __tcp_hdrlen(th);
		} else {
			struct udphdr _udphdr;

			if (skb_header_pointer(skb, skb_transport_offset(skb),
					       sizeof(_udphdr), &_udphdr))
				hdr_len += sizeof(struct udphdr);
		}

		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}

static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;

		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
#define skb_update_prio(skb)
#endif

DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

#define RECURSION_LIMIT 10

/**
 * dev_loopback_xmit - loop back @skb
 * @net: network namespace this loopback is happening in
 * @sk:  sk needed to be a netfilter okfn
 * @skb: buffer to transmit
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[skb->sender_cpu - 1]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);

		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv)
{
	int queue_index = 0;

#ifdef CONFIG_XPS
	u32 sender_cpu = skb->sender_cpu - 1;

	if (sender_cpu >= (u32)NR_CPUS)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
							    __netdev_pick_tx);
		else
			queue_index = __netdev_pick_tx(dev, skb);

		if (!accel_priv)
			queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

#ifdef CONFIG_NET_SWITCHDEV
	/* Don't forward if offload device already forwarded */
	if (skb->offload_fwd_mark &&
	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
		consume_skb(skb);
		rc = NET_XMIT_SUCCESS;
		goto out;
	}
#endif

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);

int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
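
/*
 * Usage sketch (illustrative, not part of this file): a kernel component
 * that builds its own frames sets skb->dev and the link-layer header and
 * then hands the buffer to the qdisc layer. Interrupts must be enabled;
 * the skb is always consumed, whatever the return value. dest_hw is a
 * hypothetical destination MAC address.
 *
 *	skb->dev = dev;
 *	skb_reset_network_header(skb);
 *	if (dev_hard_header(skb, dev, ETH_P_IP, dest_hw, NULL, skb->len) < 0)
 *		goto free_skb;
 *	dev_queue_xmit(skb);	// may also return positive NET_XMIT_* codes
 */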

/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

struct static_key rps_needed __read_mostly;

static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	unsigned int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		rflow = &flow_table->flows[flow_id];
		cpu = ACCESS_ONCE(rflow->cpu);
		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
			   rflow->last_qtail) <
		     (int)(10 * flow_table->mask)))
			expire = false;
	}
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}

#endif /* CONFIG_RPS */

/*
 * Check if this softnet_data structure is another cpu one
 * If yes, queue it to our IPI list and return 1
 * If no, return 0
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd != mysd) {
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
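	return 0;
}

/*
 * Usage sketch (illustrative, not part of this file): a driver implementing
 * ndo_rx_flow_steer() would periodically walk its installed filters and ask
 * the stack whether each one may be dropped. The foo_* names and priv layout
 * are hypothetical.
 *
 *	for (i = 0; i < FOO_MAX_FILTERS; i++) {
 *		struct foo_filter *f = &priv->filters[i];
 *
 *		if (f->installed &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i))
 *			foo_remove_hw_filter(priv, i);
 *	}
 */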

#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
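
/*
 * Usage sketch (illustrative, not part of this file): a non-NAPI driver's
 * receive interrupt handler typically builds an skb for each received frame
 * and posts it with netif_rx(). rx_buf and pkt_len are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb)
 *		return;		// frame is dropped, counters updated elsewhere
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */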

int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);

static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_atomic();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_atomic();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}

#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}

/**
 *	netdev_is_rx_handler_busy - check if receive handler is registered
 *	@dev: device to check
 *
 *	Check if a receive handler is already registered for a given device.
 *	Return true if there is one.
 *
 *	The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
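
/*
 * Usage sketch (illustrative, not part of this file): stacking drivers such
 * as bridging or bonding claim a lower device's receive path under rtnl_lock.
 * foo_handle_frame and foo_port are hypothetical.
 *
 *	rtnl_lock();
 *	if (!netdev_is_rx_handler_busy(lower_dev))
 *		err = netdev_rx_handler_register(lower_dev,
 *						 foo_handle_frame,
 *						 foo_port);
 *	rtnl_unlock();
 *
 *	// and on teardown:
 *	rtnl_lock();
 *	netdev_rx_handler_unregister(lower_dev);
 *	rtnl_unlock();
 */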

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}

static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (nf_hook_ingress_active(skb)) {
		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		return nf_hook_ingress(skb);
	}
#endif /* CONFIG_NETFILTER_INGRESS */
	return 0;
}

static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}

static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}

static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
			return ret;
		}
	}
#endif
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
}

/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);

/* Network device is going away, flush any packets still pending
 * Called with irqs disabled.
 */
static void flush_backlog(void *arg)
{
	struct net_device *dev = arg;
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	struct sk_buff *skb, *tmp;

	rps_lock(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock(sd);

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
}

static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb, 0);
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	return netif_receive_skb_internal(skb);
}

/* napi->gro_list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
		skb->prev = prev;
		prev = skb;
	}

	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;

		prev = skb->prev;
		napi_gro_complete(skb);
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);

static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		diffs |= skb_metadata_dst_cmp(p, skb);
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}

static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
						    skb_frag_size(frag0),
						    skb->end - skb->tail);
	}
}

static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}

static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->encap_mark = 0;
		NAPI_GRO_CB(skb)->recursion_counter = 0;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}

struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);

struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);

static void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	kmem_cache_free(skbuff_head_cache, skb);
}

static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
		break;
	}

	return ret;
}

gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);

static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	skb->vlan_tci = 0;
	skb->dev = napi->dev;
	skb->skb_iif = 0;

	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
	skb->pkt_type = PACKET_HOST;

	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	napi->skb = skb;
}

struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
		napi->skb = skb;
	}
	return skb;
}
EXPORT_SYMBOL(napi_get_frags);

static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
			napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
		break;
	}

	return ret;
}

/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}

gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi_frags_skb(napi);

	if (!skb)
		return GRO_DROP;

	trace_napi_gro_frags_entry(skb);

	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
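
/*
 * Usage sketch (illustrative, not part of this file): drivers that receive
 * directly into pages can use the napi_get_frags()/napi_gro_frags() pair
 * instead of building a linear skb themselves. page/offset/len are
 * hypothetical, and the truesize accounting is only a rough example.
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;		// out of memory, leave the page in the ring
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);
 */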

/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);

/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				smp_call_function_single_async(remsd->cpu,
							       &remsd->csd);
			remsd = next;
		}
	} else
#endif
		local_irq_enable();
}

static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}

static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			local_irq_enable();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);

/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	list_del_init(&n->poll_list);
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);

void napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags;

	/*
	 * don't let napi dequeue from the cpu poll list
	 * just in case its running on a different cpu
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	if (n->gro_list) {
		unsigned long timeout = 0;

		if (work_done)
			timeout = n->dev->gro_flush_timeout;

		if (timeout)
			hrtimer_start(&n->timer, ns_to_ktime(timeout),
				      HRTIMER_MODE_REL_PINNED);
		else
			napi_gro_flush(n, false);
	}
	if (likely(list_empty(&n->poll_list))) {
		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
	} else {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		__napi_complete(n);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
{
	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
	struct napi_struct *napi;

	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
		if (napi->napi_id == napi_id)
			return napi;

	return NULL;
}
EXPORT_SYMBOL_GPL(napi_by_id);

void napi_hash_add(struct napi_struct *napi)
{
	if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
		return;

	spin_lock(&napi_hash_lock);

	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
	do {
		if (unlikely(++napi_gen_id < NR_CPUS + 1))
			napi_gen_id = NR_CPUS + 1;
	} while (napi_by_id(napi_gen_id));
	napi->napi_id = napi_gen_id;

	hlist_add_head_rcu(&napi->napi_hash_node,
			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_add);

/* Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
void napi_hash_del(struct napi_struct *napi)
{
	spin_lock(&napi_hash_lock);

	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
		hlist_del_rcu(&napi->napi_hash_node);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_del);

static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
	struct napi_struct *napi;

	napi = container_of(timer, struct napi_struct, timer);
	napi_schedule(napi);

	return HRTIMER_NORESTART;
}

void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
}
EXPORT_SYMBOL(netif_napi_add);
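
/* Example (illustrative sketch, not part of this file): typical driver-side
 * setup in a probe routine, pairing netif_napi_add() with a poll callback
 * such as the foo_poll() sketched earlier. foo_priv is an invented name;
 * NAPI_POLL_WEIGHT is the usual weight choice.
 *
 *	struct foo_priv *priv = netdev_priv(netdev);
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 *	...
 *	// later, in the open/up path, once interrupts can fire:
 *	napi_enable(&priv->napi);
 */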
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	set_bit(NAPI_STATE_DISABLE, &n->state);

	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);

void netif_napi_del(struct napi_struct *napi)
{
	list_del_init(&napi->dev_list);
	napi_free_frags(napi);

	kfree_skb_list(napi->gro_list);
	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
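
/* Example (illustrative sketch, not part of this file): the matching
 * teardown for the setup above. napi_disable() may sleep while it waits
 * for an in-flight poll to finish, so it must run in process context
 * before the NAPI instance is deleted. foo_priv is an invented name.
 *
 *	napi_disable(&priv->napi);
 *	netif_napi_del(&priv->napi);
 */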
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi(). Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call. Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state))
		work = n->poll(n, weight);

	WARN_ON_ONCE(work > weight);

	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight. In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	/* flush too old packets
	 * If HZ < 1000, flush all packets.
	 */
	napi_gro_flush(n, HZ >= 1000);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}

static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				return;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which allows
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	struct list_head list;
	struct rcu_head rcu;
};

static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
						 struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	list_for_each_entry(adj, adj_list, list) {
		if (adj->dev == adj_dev)
			return adj;
	}
	return NULL;
}

/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
			  struct net_device *upper_dev)
{
	ASSERT_RTNL();

	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_upper_dev);

/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_upper_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->all_adj_list.upper);
}

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	ASSERT_RTNL();

	if (list_empty(&dev->adj_list.upper))
		return NULL;

	upper = list_first_entry(&dev->adj_list.upper,
				 struct netdev_adjacent, list);
	if (likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

void *netdev_adjacent_get_private(struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	adj = list_entry(adj_list, struct netdev_adjacent, list);

	return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);

/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						 struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
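
/* Example (illustrative sketch, not part of this file): walking the upper
 * devices of @dev under RCU with the iterator above; this is essentially
 * what the netdev_for_each_upper_dev_rcu() helper expands to.
 *
 *	struct list_head *iter = &dev->adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
 *		pr_debug("%s is an upper device of %s\n",
 *			 upper->name, dev->name);
 *	rcu_read_unlock();
 */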
/**
 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
						     struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->all_adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);

/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *				   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold either the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = lower->list.next;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *				       lower neighbour list, RCU variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter)
{
	struct netdev_adjacent *lower;

	WARN_ON_ONCE(!rcu_read_lock_held());

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *			   list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);

/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *					lower neighbour list, RCU variant
 * @dev: device
 *
 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 * list. The caller must hold RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
	struct netdev_adjacent *lower;

	lower = list_first_or_null_rcu(&dev->adj_list.lower,
				       struct netdev_adjacent, list);
	if (lower)
		return lower->private;
	return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	upper = list_first_or_null_rcu(&dev->adj_list.upper,
				       struct netdev_adjacent, list);
	if (upper && likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
static int netdev_adjacent_sysfs_add(struct net_device *dev,
				     struct net_device *adj_dev,
				     struct list_head *dev_list)
{
	char linkname[IFNAMSIZ + 7];

	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", adj_dev->name);
	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
				 linkname);
}

static void netdev_adjacent_sysfs_del(struct net_device *dev,
				      char *name,
				      struct list_head *dev_list)
{
	char linkname[IFNAMSIZ + 7];

	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", name);
	sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
						 struct net_device *adj_dev,
						 struct list_head *dev_list)
{
	return (dev_list == &dev->adj_list.upper ||
		dev_list == &dev->adj_list.lower) &&
	       net_eq(dev_net(dev), dev_net(adj_dev));
}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
					u16 ref_nr,
					struct list_head *dev_list,
					void *private, bool master)
{
	struct netdev_adjacent *adj;
	int ret;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (adj) {
		adj->ref_nr += ref_nr;
		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
	adj->ref_nr = ref_nr;
	adj->private = private;
	dev_hold(adj_dev);

	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
		if (ret)
			goto free_adj;
	}

	/* Ensure that master link is always the first item in list. */
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
			goto remove_symlinks;

		list_add_rcu(&adj->list, dev_list);
	} else {
		list_add_tail_rcu(&adj->list, dev_list);
	}

	return 0;

remove_symlinks:
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
	kfree(adj);
	dev_put(adj_dev);

	return ret;
}

static void __netdev_adjacent_dev_remove(struct net_device *dev,
					 struct net_device *adj_dev,
					 u16 ref_nr,
					 struct list_head *dev_list)
{
	struct netdev_adjacent *adj;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (!adj) {
		pr_err("tried to remove device %s from %s\n",
		       dev->name, adj_dev->name);
		BUG();
	}

	if (adj->ref_nr > ref_nr) {
		pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
			 ref_nr, adj->ref_nr - ref_nr);
		adj->ref_nr -= ref_nr;
		return;
	}

	if (adj->master)
		sysfs_remove_link(&(dev->dev.kobj), "master");

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

	list_del_rcu(&adj->list);
	pr_debug("dev_put for %s, because link removed from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);
	dev_put(adj_dev);
	kfree_rcu(adj, rcu);
}
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    u16 ref_nr,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int ret;

	ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
					   private, master);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
					   private, false);
	if (ret) {
		__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
		return ret;
	}

	return 0;
}

static int __netdev_adjacent_dev_link(struct net_device *dev,
				      struct net_device *upper_dev,
				      u16 ref_nr)
{
	return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
						&dev->all_adj_list.upper,
						&upper_dev->all_adj_list.lower,
						NULL, false);
}

static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       u16 ref_nr,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}

static void __netdev_adjacent_dev_unlink(struct net_device *dev,
					 struct net_device *upper_dev,
					 u16 ref_nr)
{
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
					   &dev->all_adj_list.upper,
					   &upper_dev->all_adj_list.lower);
}

static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
{
	int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);

	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
					       &dev->adj_list.upper,
					       &upper_dev->adj_list.lower,
					       private, master);
	if (ret) {
		__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
		return ret;
	}

	return 0;
}

static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = master;
	changeupper_info.linking = true;

	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices itself. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
	return 0;

rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
		}
		if (i == to_i)
			break;
	}

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

int netdev_master_upper_dev_link_private(struct net_device *dev,
					 struct net_device *upper_dev,
					 void *private)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, private);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link_private);

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j;

	ASSERT_RTNL();

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
	changeupper_info.linking = false;

	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
				      &changeupper_info.info);

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	/* Here is the tricky part. We must remove all dev's lower
	 * devices from all upper_dev's upper devices and vice
	 * versa, to maintain the graph relationship.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);

	/* remove also the devices itself from lower/upper device
	 * list
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);

	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
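
/* Example (illustrative sketch, not part of this file): how a bonding- or
 * bridge-like driver would attach and later detach a lower device. The
 * master variant guarantees at most one master upper device per slave.
 * "bond_dev" and "slave_dev" are invented names and error handling is
 * abbreviated.
 *
 *	int err;
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);
 */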
/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
	struct netdev_notifier_bonding_info info;

	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);
static void netdev_adjacent_add_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

static void netdev_adjacent_del_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
	}
}
void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev)
{
	struct netdev_adjacent *lower;

	if (!lower_dev)
		return NULL;
	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
	if (!lower)
		return NULL;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);

int dev_get_nest_level(struct net_device *dev,
		       bool (*type_check)(struct net_device *dev))
{
	struct net_device *lower = NULL;
	struct list_head *iter;
	int max_nest = -1;
	int nest;

	ASSERT_RTNL();

	netdev_for_each_lower_dev(dev, lower, iter) {
		nest = dev_get_nest_level(lower, type_check);
		if (max_nest < nest)
			max_nest = nest;
	}

	if (type_check(dev))
		max_nest++;

	return max_nest;
}
EXPORT_SYMBOL(dev_get_nest_level);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_rx_flags)
		ops->ndo_change_rx_flags(dev, flags);
}

static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				from_kuid(&init_user_ns, audit_get_loginuid(current)),
				from_kuid(&init_user_ns, uid),
				from_kgid(&init_user_ns, gid),
				audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc, true);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
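
/* Example (illustrative sketch, not part of this file): a packet-capture
 * style user taking and releasing a promiscuity reference. The count is
 * reference counted, so every +1 must eventually be balanced by a -1.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */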
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}

/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all interfaces. Once it hits zero the device reverts back to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);

/*
 * Upload unicast and multicast address lists to device and
 * configure RX filtering. When the device doesn't support unicast
 * filtering it is put in promiscuous mode while unicast addresses
 * are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags & IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}

void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 * Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 * Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 * Have we downed the interface. We handle IFF_UP ourselves
	 * according to user attempts to set it, rather than blindly
	 * setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	 * is important. Some (broken) drivers set IFF_PROMISC, when
	 * IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}

void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (gchanges)
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = changes;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
	}
}

/**
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 *
 * Change settings on device based state flags. The flags are
 * in the userspace exported format.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int ret;
	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
	__dev_notify_flags(dev, old_flags, changes);
	return ret;
}
EXPORT_SYMBOL(dev_change_flags);
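
/* Example (illustrative sketch, not part of this file): administratively
 * bringing an interface up from kernel code by setting IFF_UP through
 * dev_change_flags(), which takes care of open/close, notifiers and the
 * rtnetlink event.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: could not bring device up: %d\n", dev->name, err);
 */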
static int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

	dev->mtu = new_mtu;
	return 0;
}

/**
 * dev_set_mtu - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Change the maximum transfer size of the network device.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	int err, orig_mtu;

	if (new_mtu == dev->mtu)
		return 0;

	/* MTU must be positive. */
	if (new_mtu < 0)
		return -EINVAL;

	if (!netif_device_present(dev))
		return -ENODEV;

	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);

	if (!err) {
		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						   orig_mtu);
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						     new_mtu);
		}
	}
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);
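
/* Example (illustrative sketch, not part of this file): changing the MTU of
 * a device from kernel code. dev_set_mtu() validates the value, calls the
 * driver's ndo_change_mtu() and notifies NETDEV_CHANGEMTU listeners; it can
 * fail, so the return value must be checked.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: could not set jumbo MTU: %d\n", dev->name, err);
 */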
/**
 * dev_set_group - Change group this device belongs to
 * @dev: device
 * @new_group: group this device should belong to
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);

/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 *
 * Change the hardware (MAC) address of the device
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	if (err)
		return err;
	dev->addr_assign_type = NET_ADDR_SET;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);
	return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);

/**
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);
/**
 * dev_get_phys_port_id - Get device physical port ID
 * @dev: device
 * @ppid: port ID
 *
 * Get device physical port ID
 */
int dev_get_phys_port_id(struct net_device *dev,
			 struct netdev_phys_item_id *ppid)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);

/**
 * dev_get_phys_port_name - Get device physical port name
 * @dev: device
 * @name: port name
 * @len: limit of bytes to copy to name
 *
 * Get device physical port name
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_name)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_name(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);

/**
 * dev_change_proto_down - update protocol port state information
 * @dev: device
 * @proto_down: new value
 *
 * This info can be used by switch drivers to set the phys state of the
 * port.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_proto_down)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);

/**
 * dev_new_index - allocate an ifindex
 * @net: the applicable net namespace
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	int ifindex = net->ifindex;

	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return net->ifindex = ifindex;
	}
}
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
		on_each_cpu(flush_backlog, dev, 1);
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		/* Notify protocols, that we are about to destroy
		 * this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 * Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}

static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
	struct net_device *upper, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(upper->wanted_features & feature)
		    && (features & feature)) {
			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
				   &feature, upper->name);
			features &= ~feature;
		}
	}

	return features;
}

static void netdev_sync_lower_features(struct net_device *upper,
	struct net_device *lower, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(features & feature) && (lower->features & feature)) {
			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
				   &feature, lower->name);
			lower->wanted_features &= ~feature;
			netdev_update_features(lower);

			if (unlikely(lower->features & feature))
				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
					    &feature, lower->name);
		}
	}
}

static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!((features & NETIF_F_GEN_CSUM) ||
		      (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))
			      == (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				   "Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				   "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		   &dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			   "set_features() failed (%d); wanted %pNF, left %pNF\n",
			   err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}

/**
 * netdev_update_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications if it
 * has changed. Should be called after driver or hardware dependent
 * conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);

/**
 * netdev_change_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications even
 * if they have not changed. Should be called instead of
 * netdev_update_features() if also dev->vlan_features might
 * have changed to allow the changes to be propagated to stacked
 * VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
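
/* Example (illustrative sketch, not part of this file): how a driver
 * typically takes part in feature negotiation. ndo_fix_features() masks
 * out combinations its hardware cannot do, ndo_set_features() programs the
 * device, and netdev_update_features() is called when external conditions
 * change. All foo-prefixed names (and the MTU/LRO rule) are invented.
 *
 *	static netdev_features_t foo_fix_features(struct net_device *dev,
 *						  netdev_features_t features)
 *	{
 *		if (dev->mtu > 1500)
 *			features &= ~NETIF_F_LRO;	// invented HW limit
 *		return features;
 *	}
 *
 *	static int foo_set_features(struct net_device *dev,
 *				    netdev_features_t features)
 *	{
 *		foo_hw_configure_offloads(dev, features);
 *		return 0;
 *	}
 *
 *	// after something that affects the rule above has changed:
 *	netdev_update_features(dev);
 */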
/**
 * netif_stacked_transfer_operstate - transfer operstate
 * @rootdev: the root or lower level device to transfer state from
 * @dev: the device to transfer operstate to
 *
 * Transfer operational state from root to device. This is normally
 * called when a stacking relationship exists between the root
 * device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
				      struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

	if (netif_carrier_ok(rootdev)) {
		if (!netif_carrier_ok(dev))
			netif_carrier_on(dev);
	} else {
		if (netif_carrier_ok(dev))
			netif_carrier_off(dev);
	}
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx;
	size_t sz = count * sizeof(*rx);

	BUG_ON(count < 1);

	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!rx) {
		rx = vzalloc(sz);
		if (!rx)
			return -ENOMEM;
	}
	dev->_rx = rx;

	for (i = 0; i < count; i++)
		rx[i].dev = dev;
	return 0;
}

static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}

static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}

static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
	size_t sz = count * sizeof(*tx);

	if (count < 1 || count > 0xffff)
		return -EINVAL;

	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!tx) {
		tx = vzalloc(sz);
		if (!tx)
			return -ENOMEM;
	}
	dev->_tx = tx;

	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);

	return 0;
}
void netif_tx_stop_all_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		netif_tx_stop_queue(txq);
	}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
/**
 * register_netdevice - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * Callers must hold the rtnl semaphore. You may want
 * register_netdev() instead of this.
 *
 * BUGS:
 * The locking appears insufficient to guarantee two parallel registers
 * will not get the same name.
 */
int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK)) {
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 * Default initial state at registry is that the
	 * device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
/**
 * init_dummy_netdev - init a dummy network device for NAPI
 * @dev: device to init
 *
 * This takes a network device structure and initialize the minimum
 * amount of fields so it can be used to schedule NAPI polls without
 * registering a full blown interface. This is to be used by drivers
 * that need to tie several hardware interfaces to a single NAPI
 * poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note : We dont allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' dont need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);

/**
 * register_netdev - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * This is a wrapper around register_netdevice that takes the rtnl semaphore
 * and expands the device name if you passed a format string to
 * alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
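
/* Example (illustrative sketch, not part of this file): the minimal
 * register/unregister life cycle of an Ethernet device as seen from a
 * hypothetical driver's probe and remove paths. struct foo_priv and
 * foo_netdev_ops are invented names; error handling is abbreviated.
 *
 *	struct net_device *netdev;
 *	int err;
 *
 *	netdev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *
 *	netdev->netdev_ops = &foo_netdev_ops;
 *
 *	err = register_netdev(netdev);	// takes rtnl_lock internally
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 *	...
 *	unregister_netdev(netdev);
 *	free_netdev(netdev);
 */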
int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);

/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
/* Convert net_device_stats to rtnl_link_stats64. They have the same
 * fields in the same order, with only the type differing.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*stats64));
#else
	size_t i, n = sizeof(*stats64) / sizeof(u64);
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
		     sizeof(*stats64) / sizeof(u64));
	for (i = 0; i < n; i++)
		dst[i] = src[i];
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
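
/* Illustrative sketch (not part of this file): a driver that only maintains
 * the legacy counters in dev->stats (or a private net_device_stats copy)
 * can still report 64-bit statistics by converting them; the callback name
 * below is hypothetical.  Note that dev_get_stats() already performs this
 * conversion automatically when a driver provides no stats callback at all.
 *
 *	static struct rtnl_link_stats64 *
 *	foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s)
 *	{
 *		netdev_stats_to_stats64(s, &dev->stats);
 *		return s;
 *	}
 */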

/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
 *	@storage: place to store stats
 *
 *	Get network statistics from device. Return @storage.
 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *	otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);
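
/* Illustrative sketch (not part of this file): reading a device's counters
 * from other kernel code.  dev_get_stats() fills the caller-provided
 * structure, so a stack variable is enough; "dev" here is assumed to be a
 * valid, held net_device:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets\n", dev->name,
 *		(unsigned long long)stats.rx_packets);
 */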

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
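
/* Illustrative sketch (not part of this file): a subsystem that registers
 * devices on behalf of its drivers can install fallback ethtool ops without
 * clobbering a driver that already set its own; the names below are
 * hypothetical.
 *
 *	static const struct ethtool_ops foo_default_ethtool_ops = {
 *		.get_link	= ethtool_op_get_link,
 *	};
 *	...
 *	netdev_set_default_ethtool_ops(dev, &foo_default_ethtool_ops);
 */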

void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}

/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv: size of private data to allocate space for
 *	@name: device name format string
 *	@name_assign_type: origin of device name
 *	@setup: callback to initialize device
 *	@txqs: the number of TX subqueues to allocate
 *	@rxqs: the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = 1;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
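
/* Illustrative sketch (not part of this file): a multiqueue Ethernet driver
 * (struct foo_priv and the queue counts are hypothetical) would typically
 * allocate its device with ether_setup() as the init callback:
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * Single-queue drivers usually go through the alloc_netdev() or
 * alloc_etherdev() wrappers instead.
 */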

/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
	kvfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
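
/* Illustrative sketch (not part of this file): the usual driver error path.
 * A device that failed registration (or was never registered) is released
 * directly with free_netdev():
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */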

/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
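
/* Illustrative sketch (not part of this file): a common pattern is to
 * unpublish an RCU-protected pointer and then call synchronize_net()
 * before freeing the old object, so no receive path can still be walking
 * it ("some_cfg" and "old_cfg" are hypothetical names):
 *
 *	old_cfg = rtnl_dereference(dev->some_cfg);
 *	RCU_INIT_POINTER(dev->some_cfg, NULL);
 *	synchronize_net();
 *	kfree(old_cfg);
 */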

/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *	Note: As most callers use a stack allocated list_head,
 *	we force a list_del() to make sure the stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
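
/* Illustrative sketch (not part of this file): batching several deletions
 * in one RTNL section with a stack-allocated list head, which is the
 * pattern the note above refers to:
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */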

/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
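
/* Illustrative sketch (not part of this file): typical device removal, e.g.
 * from a driver's (hypothetical) foo_remove() callback.  unregister_netdev()
 * takes the RTNL lock itself, so it must not already be held here:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */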

/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice and unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = 0;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		dev->ifindex = dev_new_index(net);

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
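
/* Illustrative sketch (not part of this file): moving a device into another
 * namespace under RTNL; "target_net" is a hypothetical struct net the
 * caller already holds.  The "eth%d" pattern is only used if the current
 * name is already taken in the destination namespace:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */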

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}

/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
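
/* Illustrative sketch (not part of this file): how a master device such as
 * a bridge or bond might fold its slaves' feature sets together (the loop
 * and field names below are hypothetical):
 *
 *	netdev_features_t mask = master->features;
 *	netdev_features_t features = master->features & ~NETIF_F_ONE_FOR_ALL;
 *
 *	list_for_each_entry(slave, &priv->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 */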

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
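
/* Illustrative sketch (not part of this file): drivers use the generated
 * helpers instead of raw printk() so that messages carry the driver, bus
 * and device names plus the registration state, e.g.:
 *
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 *	netdev_err(dev, "failed to map DMA buffer\n");
 *
 * ("speed" is a hypothetical local variable.)
 */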

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device
	 * that appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);