net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135 #include <linux/errqueue.h>
 136
 137 #include "net-sysfs.h"
 138
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/* Held by dev_add_pack()/__dev_remove_pack() while they modify the ptype lists */
static DEFINE_SPINLOCK(ptype_lock);
/* Held by dev_add_offload()/__dev_remove_offload() while they modify offload_base */
static DEFINE_SPINLOCK(offload_lock);
/* Packet handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK (see ptype_head()) */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers whose type is htons(ETH_P_ALL) (see ptype_head()) */
struct list_head ptype_all __read_mostly;       /* Taps */
/* Offload handlers registered through dev_add_offload() */
static struct list_head offload_base __read_mostly;

/* Forward declarations */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info);
 155
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold either dev_base_lock for reading or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * For example usages see register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 177
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
/* NAPI hash table, declared with 8 hash bits */
static DEFINE_HASHTABLE(napi_hash, 8);

/*
 * Sequence counter sampled by netdev_get_name() so that it can retry when a
 * writer ran concurrently.
 * NOTE(review): presumably bumped by the device rename path, which is not
 * visible from here - confirm.
 */
static seqcount_t devnet_rename_seq;
 185
 186 static inline void dev_base_seq_inc(struct net *net)
 187 {
 188         while (++net->dev_base_seq == 0);
 189 }
 190
 191 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 192 {
 193         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 194
 195         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 196 }
 197
 198 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 199 {
 200         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 201 }
 202
 203 static inline void rps_lock(struct softnet_data *sd)
 204 {
 205 #ifdef CONFIG_RPS
 206         spin_lock(&sd->input_pkt_queue.lock);
 207 #endif
 208 }
 209
 210 static inline void rps_unlock(struct softnet_data *sd)
 211 {
 212 #ifdef CONFIG_RPS
 213         spin_unlock(&sd->input_pkt_queue.lock);
 214 #endif
 215 }
 216
 217 /* Device list insertion */
 218 static void list_netdevice(struct net_device *dev)
 219 {
 220         struct net *net = dev_net(dev);
 221
 222         ASSERT_RTNL();
 223
 224         write_lock_bh(&dev_base_lock);
 225         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 226         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 227         hlist_add_head_rcu(&dev->index_hlist,
 228                            dev_index_hash(net, dev->ifindex));
 229         write_unlock_bh(&dev_base_lock);
 230
 231         dev_base_seq_inc(net);
 232 }
 233
 234 /* Device list removal
 235  * caller must respect a RCU grace period before freeing/reusing dev
 236  */
 237 static void unlist_netdevice(struct net_device *dev)
 238 {
 239         ASSERT_RTNL();
 240
 241         /* Unlink dev from the device chain */
 242         write_lock_bh(&dev_base_lock);
 243         list_del_rcu(&dev->dev_list);
 244         hlist_del_rcu(&dev->name_hlist);
 245         hlist_del_rcu(&dev->index_hlist);
 246         write_unlock_bh(&dev_base_lock);
 247
 248         dev_base_seq_inc(dev_net(dev));
 249 }
 250
/*
 *      Our notifier list
 */

/*
 * Declared as a raw notifier head.
 * NOTE(review): callers presumably serialise access via RTNL - confirm.
 */
static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

/* One struct softnet_data per CPU; the per-CPU symbol is exported. */
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 264
 265 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the name
 * at index i labels the lock class used for the ARPHRD_* value at index i.
 * Keep both tables the same length and in the same order.  The last entry
 * doubles as the fallback that netdev_lock_pos() returns for device types
 * that are not listed.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lock class names, index-matched with netdev_lock_type[] */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per table entry: xmit locks and address-list locks */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 306
 307 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 308 {
 309         int i;
 310
 311         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 312                 if (netdev_lock_type[i] == dev_type)
 313                         return i;
 314         /* the last key is used by default */
 315         return ARRAY_SIZE(netdev_lock_type) - 1;
 316 }
 317
 318 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 319                                                  unsigned short dev_type)
 320 {
 321         int i;
 322
 323         i = netdev_lock_pos(dev_type);
 324         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 325                                    netdev_lock_name[i]);
 326 }
 327
 328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev->type);
 333         lockdep_set_class_and_name(&dev->addr_list_lock,
 334                                    &netdev_addr_lock_key[i],
 335                                    netdev_lock_name[i]);
 336 }
 337 #else
 338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 339                                                  unsigned short dev_type)
 340 {
 341 }
 342 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 343 {
 344 }
 345 #endif
 346
 347 /*******************************************************************************
 348
 349                 Protocol management and registration routines
 350
 351 *******************************************************************************/
 352
/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets
 *      MUST BE last in their hash buckets, and the scan of protocol
 *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *      This is true now; do not change it.
 *      Explanation: if a protocol handler that mangles packets were
 *      first on the list, it could not sense that the packet
 *      is cloned and should be copied-on-write, so it would
 *      change it and subsequent readers would get a broken packet.
 *                                                      --ANK (980803)
 */
 368
 369 static inline struct list_head *ptype_head(const struct packet_type *pt)
 370 {
 371         if (pt->type == htons(ETH_P_ALL))
 372                 return &ptype_all;
 373         else
 374                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 375 }
 376
 377 /**
 378  *      dev_add_pack - add packet handler
 379  *      @pt: packet type declaration
 380  *
 381  *      Add a protocol handler to the networking stack. The passed &packet_type
 382  *      is linked into kernel lists and may not be freed until it has been
 383  *      removed from the kernel lists.
 384  *
 385  *      This call does not sleep therefore it can not
 386  *      guarantee all CPU's that are in middle of receiving packets
 387  *      will see the new packet type (until the next received packet).
 388  */
 389
 390 void dev_add_pack(struct packet_type *pt)
 391 {
 392         struct list_head *head = ptype_head(pt);
 393
 394         spin_lock(&ptype_lock);
 395         list_add_rcu(&pt->list, head);
 396         spin_unlock(&ptype_lock);
 397 }
 398 EXPORT_SYMBOL(dev_add_pack);
 399
 400 /**
 401  *      __dev_remove_pack        - remove packet handler
 402  *      @pt: packet type declaration
 403  *
 404  *      Remove a protocol handler that was previously added to the kernel
 405  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 406  *      from the kernel lists and can be freed or reused once this function
 407  *      returns.
 408  *
 409  *      The packet type might still be in use by receivers
 410  *      and must not be freed until after all the CPU's have gone
 411  *      through a quiescent state.
 412  */
 413 void __dev_remove_pack(struct packet_type *pt)
 414 {
 415         struct list_head *head = ptype_head(pt);
 416         struct packet_type *pt1;
 417
 418         spin_lock(&ptype_lock);
 419
 420         list_for_each_entry(pt1, head, list) {
 421                 if (pt == pt1) {
 422                         list_del_rcu(&pt->list);
 423                         goto out;
 424                 }
 425         }
 426
 427         pr_warn("dev_remove_pack: %p not found\n", pt);
 428 out:
 429         spin_unlock(&ptype_lock);
 430 }
 431 EXPORT_SYMBOL(__dev_remove_pack);
 432
 433 /**
 434  *      dev_remove_pack  - remove packet handler
 435  *      @pt: packet type declaration
 436  *
 437  *      Remove a protocol handler that was previously added to the kernel
 438  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 439  *      from the kernel lists and can be freed or reused once this function
 440  *      returns.
 441  *
 442  *      This call sleeps to guarantee that no CPU is looking at the packet
 443  *      type after return.
 444  */
 445 void dev_remove_pack(struct packet_type *pt)
 446 {
 447         __dev_remove_pack(pt);
 448
 449         synchronize_net();
 450 }
 451 EXPORT_SYMBOL(dev_remove_pack);
 452
 453
 454 /**
 455  *      dev_add_offload - register offload handlers
 456  *      @po: protocol offload declaration
 457  *
 458  *      Add protocol offload handlers to the networking stack. The passed
 459  *      &proto_offload is linked into kernel lists and may not be freed until
 460  *      it has been removed from the kernel lists.
 461  *
 462  *      This call does not sleep therefore it can not
 463  *      guarantee all CPU's that are in middle of receiving packets
 464  *      will see the new offload handlers (until the next received packet).
 465  */
 466 void dev_add_offload(struct packet_offload *po)
 467 {
 468         struct list_head *head = &offload_base;
 469
 470         spin_lock(&offload_lock);
 471         list_add_rcu(&po->list, head);
 472         spin_unlock(&offload_lock);
 473 }
 474 EXPORT_SYMBOL(dev_add_offload);
 475
 476 /**
 477  *      __dev_remove_offload     - remove offload handler
 478  *      @po: packet offload declaration
 479  *
 480  *      Remove a protocol offload handler that was previously added to the
 481  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 482  *      is removed from the kernel lists and can be freed or reused once this
 483  *      function returns.
 484  *
 485  *      The packet type might still be in use by receivers
 486  *      and must not be freed until after all the CPU's have gone
 487  *      through a quiescent state.
 488  */
 489 static void __dev_remove_offload(struct packet_offload *po)
 490 {
 491         struct list_head *head = &offload_base;
 492         struct packet_offload *po1;
 493
 494         spin_lock(&offload_lock);
 495
 496         list_for_each_entry(po1, head, list) {
 497                 if (po == po1) {
 498                         list_del_rcu(&po->list);
 499                         goto out;
 500                 }
 501         }
 502
 503         pr_warn("dev_remove_offload: %p not found\n", po);
 504 out:
 505         spin_unlock(&offload_lock);
 506 }
 507
 508 /**
 509  *      dev_remove_offload       - remove packet offload handler
 510  *      @po: packet offload declaration
 511  *
 512  *      Remove a packet offload handler that was previously added to the kernel
 513  *      offload handlers by dev_add_offload(). The passed &offload_type is
 514  *      removed from the kernel lists and can be freed or reused once this
 515  *      function returns.
 516  *
 517  *      This call sleeps to guarantee that no CPU is looking at the packet
 518  *      type after return.
 519  */
 520 void dev_remove_offload(struct packet_offload *po)
 521 {
 522         __dev_remove_offload(po);
 523
 524         synchronize_net();
 525 }
 526 EXPORT_SYMBOL(dev_remove_offload);
 527
 528 /******************************************************************************
 529
 530                       Device Boot-time Settings Routines
 531
 532 *******************************************************************************/
 533
/*
 * Boot time configuration table.
 * A slot counts as unused while its name starts with '\0' or ' '
 * (see netdev_boot_setup_add() and netdev_boot_setup_check()).
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 536
 537 /**
 538  *      netdev_boot_setup_add   - add new setup entry
 539  *      @name: name of the device
 540  *      @map: configured settings for the device
 541  *
 542  *      Adds new setup entry to the dev_boot_setup list.  The function
 543  *      returns 0 on error and 1 on success.  This is a generic routine to
 544  *      all netdevices.
 545  */
 546 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 547 {
 548         struct netdev_boot_setup *s;
 549         int i;
 550
 551         s = dev_boot_setup;
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 553                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 554                         memset(s[i].name, 0, sizeof(s[i].name));
 555                         strlcpy(s[i].name, name, IFNAMSIZ);
 556                         memcpy(&s[i].map, map, sizeof(s[i].map));
 557                         break;
 558                 }
 559         }
 560
 561         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 562 }
 563
 564 /**
 565  *      netdev_boot_setup_check - check boot time settings
 566  *      @dev: the netdevice
 567  *
 568  *      Check boot time settings for the device.
 569  *      The found settings are set for the device to be used
 570  *      later in the device probing.
 571  *      Returns 0 if no settings found, 1 if they are.
 572  */
 573 int netdev_boot_setup_check(struct net_device *dev)
 574 {
 575         struct netdev_boot_setup *s = dev_boot_setup;
 576         int i;
 577
 578         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 579                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 580                     !strcmp(dev->name, s[i].name)) {
 581                         dev->irq        = s[i].map.irq;
 582                         dev->base_addr  = s[i].map.base_addr;
 583                         dev->mem_start  = s[i].map.mem_start;
 584                         dev->mem_end    = s[i].map.mem_end;
 585                         return 1;
 586                 }
 587         }
 588         return 0;
 589 }
 590 EXPORT_SYMBOL(netdev_boot_setup_check);
 591
 592
 593 /**
 594  *      netdev_boot_base        - get address from boot time settings
 595  *      @prefix: prefix for network device
 596  *      @unit: id for network device
 597  *
 598  *      Check boot time settings for the base address of device.
 599  *      The found settings are set for the device to be used
 600  *      later in the device probing.
 601  *      Returns 0 if no settings found.
 602  */
 603 unsigned long netdev_boot_base(const char *prefix, int unit)
 604 {
 605         const struct netdev_boot_setup *s = dev_boot_setup;
 606         char name[IFNAMSIZ];
 607         int i;
 608
 609         sprintf(name, "%s%d", prefix, unit);
 610
 611         /*
 612          * If device already registered then return base of 1
 613          * to indicate not to probe for this interface
 614          */
 615         if (__dev_get_by_name(&init_net, name))
 616                 return 1;
 617
 618         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 619                 if (!strcmp(name, s[i].name))
 620                         return s[i].map.base_addr;
 621         return 0;
 622 }
 623
 624 /*
 625  * Saves at boot time configured settings for any netdevice.
 626  */
 627 int __init netdev_boot_setup(char *str)
 628 {
 629         int ints[5];
 630         struct ifmap map;
 631
 632         str = get_options(str, ARRAY_SIZE(ints), ints);
 633         if (!str || !*str)
 634                 return 0;
 635
 636         /* Save settings */
 637         memset(&map, 0, sizeof(map));
 638         if (ints[0] > 0)
 639                 map.irq = ints[1];
 640         if (ints[0] > 1)
 641                 map.base_addr = ints[2];
 642         if (ints[0] > 2)
 643                 map.mem_start = ints[3];
 644         if (ints[0] > 3)
 645                 map.mem_end = ints[4];
 646
 647         /* Add new entry to the list */
 648         return netdev_boot_setup_add(str, &map);
 649 }
 650
 651 __setup("netdev=", netdev_boot_setup);
 652
 653 /*******************************************************************************
 654
 655                             Device Interface Subroutines
 656
 657 *******************************************************************************/
 658
 659 /**
 660  *      __dev_get_by_name       - find a device by its name
 661  *      @net: the applicable net namespace
 662  *      @name: name to find
 663  *
 664  *      Find an interface by name. Must be called under RTNL semaphore
 665  *      or @dev_base_lock. If the name is found a pointer to the device
 666  *      is returned. If the name is not found then %NULL is returned. The
 667  *      reference counters are not incremented so the caller must be
 668  *      careful with locks.
 669  */
 670
 671 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 672 {
 673         struct net_device *dev;
 674         struct hlist_head *head = dev_name_hash(net, name);
 675
 676         hlist_for_each_entry(dev, head, name_hlist)
 677                 if (!strncmp(dev->name, name, IFNAMSIZ))
 678                         return dev;
 679
 680         return NULL;
 681 }
 682 EXPORT_SYMBOL(__dev_get_by_name);
 683
 684 /**
 685  *      dev_get_by_name_rcu     - find a device by its name
 686  *      @net: the applicable net namespace
 687  *      @name: name to find
 688  *
 689  *      Find an interface by name.
 690  *      If the name is found a pointer to the device is returned.
 691  *      If the name is not found then %NULL is returned.
 692  *      The reference counters are not incremented so the caller must be
 693  *      careful with locks. The caller must hold RCU lock.
 694  */
 695
 696 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 697 {
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_name_hash(net, name);
 700
 701         hlist_for_each_entry_rcu(dev, head, name_hlist)
 702                 if (!strncmp(dev->name, name, IFNAMSIZ))
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(dev_get_by_name_rcu);
 708
 709 /**
 710  *      dev_get_by_name         - find a device by its name
 711  *      @net: the applicable net namespace
 712  *      @name: name to find
 713  *
 714  *      Find an interface by name. This can be called from any
 715  *      context and does its own locking. The returned handle has
 716  *      the usage count incremented and the caller must use dev_put() to
 717  *      release it when it is no longer needed. %NULL is returned if no
 718  *      matching device is found.
 719  */
 720
 721 struct net_device *dev_get_by_name(struct net *net, const char *name)
 722 {
 723         struct net_device *dev;
 724
 725         rcu_read_lock();
 726         dev = dev_get_by_name_rcu(net, name);
 727         if (dev)
 728                 dev_hold(dev);
 729         rcu_read_unlock();
 730         return dev;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name);
 733
 734 /**
 735  *      __dev_get_by_index - find a device by its ifindex
 736  *      @net: the applicable net namespace
 737  *      @ifindex: index of device
 738  *
 739  *      Search for an interface by index. Returns %NULL if the device
 740  *      is not found or a pointer to the device. The device has not
 741  *      had its reference counter increased so the caller must be careful
 742  *      about locking. The caller must hold either the RTNL semaphore
 743  *      or @dev_base_lock.
 744  */
 745
 746 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 747 {
 748         struct net_device *dev;
 749         struct hlist_head *head = dev_index_hash(net, ifindex);
 750
 751         hlist_for_each_entry(dev, head, index_hlist)
 752                 if (dev->ifindex == ifindex)
 753                         return dev;
 754
 755         return NULL;
 756 }
 757 EXPORT_SYMBOL(__dev_get_by_index);
 758
 759 /**
 760  *      dev_get_by_index_rcu - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold RCU lock.
 768  */
 769
 770 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 771 {
 772         struct net_device *dev;
 773         struct hlist_head *head = dev_index_hash(net, ifindex);
 774
 775         hlist_for_each_entry_rcu(dev, head, index_hlist)
 776                 if (dev->ifindex == ifindex)
 777                         return dev;
 778
 779         return NULL;
 780 }
 781 EXPORT_SYMBOL(dev_get_by_index_rcu);
 782
 783
 784 /**
 785  *      dev_get_by_index - find a device by its ifindex
 786  *      @net: the applicable net namespace
 787  *      @ifindex: index of device
 788  *
 789  *      Search for an interface by index. Returns NULL if the device
 790  *      is not found or a pointer to the device. The device returned has
 791  *      had a reference added and the pointer is safe until the user calls
 792  *      dev_put to indicate they have finished with it.
 793  */
 794
 795 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 796 {
 797         struct net_device *dev;
 798
 799         rcu_read_lock();
 800         dev = dev_get_by_index_rcu(net, ifindex);
 801         if (dev)
 802                 dev_hold(dev);
 803         rcu_read_unlock();
 804         return dev;
 805 }
 806 EXPORT_SYMBOL(dev_get_by_index);
 807
 808 /**
 809  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 810  *      @net: network namespace
 811  *      @name: a pointer to the buffer where the name will be stored.
 812  *      @ifindex: the ifindex of the interface to get the name from.
 813  *
 814  *      The use of raw_seqcount_begin() and cond_resched() before
 815  *      retrying is required as we want to give the writers a chance
 816  *      to complete when CONFIG_PREEMPT is not set.
 817  */
 818 int netdev_get_name(struct net *net, char *name, int ifindex)
 819 {
 820         struct net_device *dev;
 821         unsigned int seq;
 822
 823 retry:
 824         seq = raw_seqcount_begin(&devnet_rename_seq);
 825         rcu_read_lock();
 826         dev = dev_get_by_index_rcu(net, ifindex);
 827         if (!dev) {
 828                 rcu_read_unlock();
 829                 return -ENODEV;
 830         }
 831
 832         strcpy(name, dev->name);
 833         rcu_read_unlock();
 834         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 835                 cond_resched();
 836                 goto retry;
 837         }
 838
 839         return 0;
 840 }
 841
 842 /**
 843  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 844  *      @net: the applicable net namespace
 845  *      @type: media type of device
 846  *      @ha: hardware address
 847  *
 848  *      Search for an interface by MAC address. Returns NULL if the device
 849  *      is not found or a pointer to the device.
 850  *      The caller must hold RCU or RTNL.
 851  *      The returned device has not had its ref count increased
 852  *      and the caller must therefore be careful about locking
 853  *
 854  */
 855
 856 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 857                                        const char *ha)
 858 {
 859         struct net_device *dev;
 860
 861         for_each_netdev_rcu(net, dev)
 862                 if (dev->type == type &&
 863                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 864                         return dev;
 865
 866         return NULL;
 867 }
 868 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 869
 870 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 871 {
 872         struct net_device *dev;
 873
 874         ASSERT_RTNL();
 875         for_each_netdev(net, dev)
 876                 if (dev->type == type)
 877                         return dev;
 878
 879         return NULL;
 880 }
 881 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 882
 883 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 884 {
 885         struct net_device *dev, *ret = NULL;
 886
 887         rcu_read_lock();
 888         for_each_netdev_rcu(net, dev)
 889                 if (dev->type == type) {
 890                         dev_hold(dev);
 891                         ret = dev;
 892                         break;
 893                 }
 894         rcu_read_unlock();
 895         return ret;
 896 }
 897 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 898
 899 /**
 900  *      __dev_get_by_flags - find any device with given flags
 901  *      @net: the applicable net namespace
 902  *      @if_flags: IFF_* values
 903  *      @mask: bitmask of bits in if_flags to check
 904  *
 905  *      Search for any interface with the given flags. Returns NULL if a device
 906  *      is not found or a pointer to the device. Must be called inside
 907  *      rtnl_lock(), and result refcount is unchanged.
 908  */
 909
 910 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 911                                       unsigned short mask)
 912 {
 913         struct net_device *dev, *ret;
 914
 915         ASSERT_RTNL();
 916
 917         ret = NULL;
 918         for_each_netdev(net, dev) {
 919                 if (((dev->flags ^ if_flags) & mask) == 0) {
 920                         ret = dev;
 921                         break;
 922                 }
 923         }
 924         return ret;
 925 }
 926 EXPORT_SYMBOL(__dev_get_by_flags);
 927
 928 /**
 929  *      dev_valid_name - check if name is okay for network device
 930  *      @name: name string
 931  *
 932  *      Network device names need to be valid file names to
 933  *      to allow sysfs to work.  We also disallow any kind of
 934  *      whitespace.
 935  */
 936 bool dev_valid_name(const char *name)
 937 {
 938         if (*name == '\0')
 939                 return false;
 940         if (strlen(name) >= IFNAMSIZ)
 941                 return false;
 942         if (!strcmp(name, ".") || !strcmp(name, ".."))
 943                 return false;
 944
 945         while (*name) {
 946                 if (*name == '/' || *name == ':' || isspace(*name))
 947                         return false;
 948                 name++;
 949         }
 950         return true;
 951 }
 952 EXPORT_SYMBOL(dev_valid_name);
 953
/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" it will try and find a suitable
 *      id. It scans list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code:
 *      -EINVAL for a malformed format, -ENOMEM if the bitmap page cannot
 *      be allocated, -ENFILE if the resulting name is already in use.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        /* One bit per candidate unit number, all held in a single page */
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        /* Only the first IFNAMSIZ-1 characters are searched for a '%' */
        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other
                 * "%" characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                /* Mark every unit number already taken by an existing name */
                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                /* Lowest unit number not claimed above */
                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        /* Build the candidate name (i stays 0 when @name has no '%') */
        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}
1019
1020 /**
1021  *      dev_alloc_name - allocate a name for a device
1022  *      @dev: device
1023  *      @name: name format string
1024  *
1025  *      Passed a format string - eg "lt%d" it will try and find a suitable
1026  *      id. It scans list of devices to build up a free map, then chooses
1027  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1028  *      while allocating the name and adding the device in order to avoid
1029  *      duplicates.
1030  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1031  *      Returns the number of the unit assigned or a negative errno code.
1032  */
1033
1034 int dev_alloc_name(struct net_device *dev, const char *name)
1035 {
1036         char buf[IFNAMSIZ];
1037         struct net *net;
1038         int ret;
1039
1040         BUG_ON(!dev_net(dev));
1041         net = dev_net(dev);
1042         ret = __dev_alloc_name(net, name, buf);
1043         if (ret >= 0)
1044                 strlcpy(dev->name, buf, IFNAMSIZ);
1045         return ret;
1046 }
1047 EXPORT_SYMBOL(dev_alloc_name);
1048
1049 static int dev_alloc_name_ns(struct net *net,
1050                              struct net_device *dev,
1051                              const char *name)
1052 {
1053         char buf[IFNAMSIZ];
1054         int ret;
1055
1056         ret = __dev_alloc_name(net, name, buf);
1057         if (ret >= 0)
1058                 strlcpy(dev->name, buf, IFNAMSIZ);
1059         return ret;
1060 }
1061
1062 int dev_get_valid_name(struct net *net, struct net_device *dev,
1063                        const char *name)
1064 {
1065         BUG_ON(!net);
1066
1067         if (!dev_valid_name(name))
1068                 return -EINVAL;
1069
1070         if (strchr(name, '%'))
1071                 return dev_alloc_name_ns(net, dev, name);
1072         else if (__dev_get_by_name(net, name))
1073                 return -EEXIST;
1074         else if (dev->name != name)
1075                 strlcpy(dev->name, name, IFNAMSIZ);
1076
1077         return 0;
1078 }
1079 EXPORT_SYMBOL(dev_get_valid_name);
1080
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes long because IFNAMSIZ bytes are copied
 *                from it on the rollback path
 *
 *      Change name of a device, can pass format strings "eth%d".
 *      for wildcarding.
 *
 *      Fails with -EBUSY while the device is up. Returns a non-negative
 *      value on success or a negative errno code. If a NETDEV_CHANGENAME
 *      notifier rejects the change, the old name is restored and the
 *      notifier's error is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /* Readers such as netdev_get_name() retry on this sequence count */
        write_seqcount_begin(&devnet_rename_seq);

        /* Same name: nothing to do */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so the change can be undone */
        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Reached on the first pass with the new name in dev->name, and
         * again after a notifier failure with the old name put back.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /* Unhash the device from the name table ... */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        /* ... and rehash it under the name now stored in dev->name */
        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First notifier failure: record it, swap the old
                         * and new names/assign types and run the rename
                         * sequence again to undo the change.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The undo pass was rejected too: only report it */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1169
1170 /**
1171  *      dev_set_alias - change ifalias of a device
1172  *      @dev: device
1173  *      @alias: name up to IFALIASZ
1174  *      @len: limit of bytes to copy from info
1175  *
1176  *      Set ifalias for a device,
1177  */
1178 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1179 {
1180         char *new_ifalias;
1181
1182         ASSERT_RTNL();
1183
1184         if (len >= IFALIASZ)
1185                 return -EINVAL;
1186
1187         if (!len) {
1188                 kfree(dev->ifalias);
1189                 dev->ifalias = NULL;
1190                 return 0;
1191         }
1192
1193         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1194         if (!new_ifalias)
1195                 return -ENOMEM;
1196         dev->ifalias = new_ifalias;
1197         memcpy(dev->ifalias, alias, len);
1198         dev->ifalias[len] = 0;
1199
1200         return len;
1201 }
1202
1203
/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features. Sends
 *      NETDEV_FEAT_CHANGE to the netdevice notifier chain; the result of
 *      the notifier call is not returned to the caller.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
1214 EXPORT_SYMBOL(netdev_features_change);
1215
1216 /**
1217  *      netdev_state_change - device changes state
1218  *      @dev: device to cause notification
1219  *
1220  *      Called to indicate a device has changed state. This function calls
1221  *      the notifier chains for netdev_chain and sends a NEWLINK message
1222  *      to the routing socket.
1223  */
1224 void netdev_state_change(struct net_device *dev)
1225 {
1226         if (dev->flags & IFF_UP) {
1227                 struct netdev_notifier_change_info change_info;
1228
1229                 change_info.flags_changed = 0;
1230                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1231                                               &change_info.info);
1232                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1233         }
1234 }
1235 EXPORT_SYMBOL(netdev_state_change);
1236
/**
 *      netdev_notify_peers - notify network peers about existence of @dev
 *      @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes rtnl_lock() itself around the NETDEV_NOTIFY_PEERS and
 * NETDEV_RESEND_IGMP notifications.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
        rtnl_unlock();
}
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255
1256 static int __dev_open(struct net_device *dev)
1257 {
1258         const struct net_device_ops *ops = dev->netdev_ops;
1259         int ret;
1260
1261         ASSERT_RTNL();
1262
1263         if (!netif_device_present(dev))
1264                 return -ENODEV;
1265
1266         /* Block netpoll from trying to do any rx path servicing.
1267          * If we don't do this there is a chance ndo_poll_controller
1268          * or ndo_poll may be running while we open the device
1269          */
1270         netpoll_poll_disable(dev);
1271
1272         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273         ret = notifier_to_errno(ret);
1274         if (ret)
1275                 return ret;
1276
1277         set_bit(__LINK_STATE_START, &dev->state);
1278
1279         if (ops->ndo_validate_addr)
1280                 ret = ops->ndo_validate_addr(dev);
1281
1282         if (!ret && ops->ndo_open)
1283                 ret = ops->ndo_open(dev);
1284
1285         netpoll_poll_enable(dev);
1286
1287         if (ret)
1288                 clear_bit(__LINK_STATE_START, &dev->state);
1289         else {
1290                 dev->flags |= IFF_UP;
1291                 dev_set_rx_mode(dev);
1292                 dev_activate(dev);
1293                 add_device_randomness(dev->dev_addr, dev->addr_len);
1294         }
1295
1296         return ret;
1297 }
1298
1299 /**
1300  *      dev_open        - prepare an interface for use.
1301  *      @dev:   device to open
1302  *
1303  *      Takes a device from down to up state. The device's private open
1304  *      function is invoked and then the multicast lists are loaded. Finally
1305  *      the device is moved into the up state and a %NETDEV_UP message is
1306  *      sent to the netdev notifier chain.
1307  *
1308  *      Calling this function on an active interface is a nop. On a failure
1309  *      a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313         int ret;
1314
1315         if (dev->flags & IFF_UP)
1316                 return 0;
1317
1318         ret = __dev_open(dev);
1319         if (ret < 0)
1320                 return ret;
1321
1322         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323         call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325         return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328
/*
 *      __dev_close_many - take every device on a close list down
 *      @head: list of devices linked through their close_list member
 *
 *      Works in three passes: first each device is announced with
 *      NETDEV_GOING_DOWN and has __LINK_STATE_START cleared, then all
 *      devices are deactivated together, and finally each driver's
 *      ndo_stop hook is called and IFF_UP is cleared. Netpoll is disabled
 *      for a device in the first pass and re-enabled in the last.
 *      Asserts that RTNL is held and may sleep. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                netpoll_poll_enable(dev);
        }

        return 0;
}
1374
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383
1384         return retval;
1385 }
1386
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395
1396         __dev_close_many(head);
1397
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403
1404         return 0;
1405 }
1406
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428
1429
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         /*
1441          * If we're trying to disable lro on a vlan device
1442          * use the underlying physical device instead
1443          */
1444         if (is_vlan_dev(dev))
1445                 dev = vlan_dev_real_dev(dev);
1446
1447         /* the same for macvlan devices */
1448         if (netif_is_macvlan(dev))
1449                 dev = macvlan_dev_real_dev(dev);
1450
1451         dev->wanted_features &= ~NETIF_F_LRO;
1452         netdev_update_features(dev);
1453
1454         if (unlikely(dev->features & NETIF_F_LRO))
1455                 netdev_WARN(dev, "failed to disable LRO!\n");
1456 }
1457 EXPORT_SYMBOL(dev_disable_lro);
1458
1459 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1460                                    struct net_device *dev)
1461 {
1462         struct netdev_notifier_info info;
1463
1464         netdev_notifier_info_init(&info, dev);
1465         return nb->notifier_call(nb, val, &info);
1466 }
1467
/* While non-zero, register_netdevice_notifier() skips replaying events */
static int dev_boot_phase = 1;
1469
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 *
 *      If the notifier rejects a replayed NETDEV_REGISTER, the events
 *      already replayed to it are undone and it is removed from the
 *      chain again. Takes rtnl_lock() itself.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay while dev_boot_phase is still set */
        if (dev_boot_phase)
                goto unlock;
        /* Replay REGISTER for every device, and UP for those that are up */
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /*
         * Walk the devices in the same order again and undo the replay
         * for each one visited before the device that failed.
         */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
1535 EXPORT_SYMBOL(register_netdevice_notifier);
1536
1537 /**
1538  *      unregister_netdevice_notifier - unregister a network notifier block
1539  *      @nb: notifier
1540  *
1541  *      Unregister a notifier previously registered by
1542  *      register_netdevice_notifier(). The notifier is unlinked into the
1543  *      kernel structures and may then be reused. A negative errno code
1544  *      is returned on a failure.
1545  *
1546  *      After unregistering unregister and down device events are synthesized
1547  *      for all devices on the device list to the removed notifier to remove
1548  *      the need for special case cleanup code.
1549  */
1550
1551 int unregister_netdevice_notifier(struct notifier_block *nb)
1552 {
1553         struct net_device *dev;
1554         struct net *net;
1555         int err;
1556
1557         rtnl_lock();
1558         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1559         if (err)
1560                 goto unlock;
1561
1562         for_each_net(net) {
1563                 for_each_netdev(net, dev) {
1564                         if (dev->flags & IFF_UP) {
1565                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1566                                                         dev);
1567                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1568                         }
1569                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1570                 }
1571         }
1572 unlock:
1573         rtnl_unlock();
1574         return err;
1575 }
1576 EXPORT_SYMBOL(unregister_netdevice_notifier);
1577
/**
 *      call_netdevice_notifiers_info - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *      @info: notifier information data
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain().
 *
 *      Asserts that RTNL is held and initialises @info for @dev before
 *      running the chain.
 */

static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info)
{
        ASSERT_RTNL();
        netdev_notifier_info_init(info, dev);
        return raw_notifier_call_chain(&netdev_chain, val, info);
}
1596
/**
 *      call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *      Call all network notifier blocks.  Parameters and return value
 *      are as for raw_notifier_call_chain().
 *
 *      Convenience wrapper around call_netdevice_notifiers_info() that
 *      supplies a plain notifier info structure from the stack.
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        struct netdev_notifier_info info;

        return call_netdevice_notifiers_info(val, dev, &info);
}
1612 EXPORT_SYMBOL(call_netdevice_notifiers);
1613
/* Static key tested by the receive/transmit timestamping helpers below */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* Number of static key decrements queued by net_disable_timestamp() */
static atomic_t netstamp_needed_deferred;
/*
 * Work handler: atomically takes the count of pending decrements,
 * resetting it to zero, and applies each one to netstamp_needed.
 */
static void netstamp_clear(struct work_struct *work)
{
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

        while (deferred--)
                static_key_slow_dec(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
1626
/* Take one reference on the netstamp_needed static key. */
void net_enable_timestamp(void)
{
        static_key_slow_inc(&netstamp_needed);
}
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632
/*
 * Drop one reference on the netstamp_needed static key. With
 * HAVE_JUMP_LABEL the decrement is only counted here and carried out
 * later by netstamp_work; otherwise it is done immediately.
 */
void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
        /* net_disable_timestamp() can be called from non process context */
        atomic_inc(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_key_slow_dec(&netstamp_needed);
#endif
}
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644
/*
 * Reset the skb's timestamp to zero, then stamp it via
 * __net_timestamp() if the netstamp_needed static key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp.tv64 = 0;
        if (static_key_false(&netstamp_needed))
                __net_timestamp(skb);
}
1651
/*
 * net_timestamp_check - stamp an skb that has no timestamp yet
 * @COND: extra condition that must hold for the skb to be stamped
 * @SKB: buffer to stamp
 *
 * Does nothing unless the netstamp_needed static key is enabled. When it
 * is, @SKB is stamped via __net_timestamp() if @COND is true and the
 * skb's tstamp.tv64 is still zero.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * and can be used safely as the body of an if/else.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1657
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660         unsigned int len;
1661
1662         if (!(dev->flags & IFF_UP))
1663                 return false;
1664
1665         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666         if (skb->len <= len)
1667                 return true;
1668
1669         /* if TSO is enabled, we don't care about the length as the packet
1670          * could be forwarded without being segmented before
1671          */
1672         if (skb_is_gso(skb))
1673                 return true;
1674
1675         return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                         atomic_long_inc(&dev->rx_dropped);
1684                         kfree_skb(skb);
1685                         return NET_RX_DROP;
1686                 }
1687         }
1688
1689         if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                 atomic_long_inc(&dev->rx_dropped);
1691                 kfree_skb(skb);
1692                 return NET_RX_DROP;
1693         }
1694
1695         skb_scrub_packet(skb, true);
1696         skb->protocol = eth_type_trans(skb, dev);
1697         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698
1699         return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc = __dev_forward_skb(dev, skb);

        /* A non-zero result means the skb was already dropped and freed */
        if (rc)
                return rc;

        return netif_rx_internal(skb);
}
1725 EXPORT_SYMBOL_GPL(dev_forward_skb);
1726
1727 static inline int deliver_skb(struct sk_buff *skb,
1728                               struct packet_type *pt_prev,
1729                               struct net_device *orig_dev)
1730 {
1731         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732                 return -ENOMEM;
1733         atomic_inc(&skb->users);
1734         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739         if (!ptype->af_packet_priv || !skb->sk)
1740                 return false;
1741
1742         if (ptype->id_match)
1743                 return ptype->id_match(ptype, skb->sk);
1744         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745                 return true;
1746
1747         return false;
1748 }
1749
1750 /*
1751  *      Support routine. Sends outgoing frames to any network
1752  *      taps currently in use.
1753  */
1754
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757         struct packet_type *ptype;
1758         struct sk_buff *skb2 = NULL;
1759         struct packet_type *pt_prev = NULL;
1760
1761         rcu_read_lock();
1762         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763                 /* Never send packets back to the socket
1764                  * they originated from - MvS (miquels@drinkel.ow.org)
1765                  */
1766                 if ((ptype->dev == dev || !ptype->dev) &&
1767                     (!skb_loop_sk(ptype, skb))) {
1768                         if (pt_prev) {
1769                                 deliver_skb(skb2, pt_prev, skb->dev);
1770                                 pt_prev = ptype;
1771                                 continue;
1772                         }
1773
1774                         skb2 = skb_clone(skb, GFP_ATOMIC);
1775                         if (!skb2)
1776                                 break;
1777
1778                         net_timestamp_set(skb2);
1779
1780                         /* skb->nh should be correctly
1781                            set by sender, so that the second statement is
1782                            just protection against buggy protocols.
1783                          */
1784                         skb_reset_mac_header(skb2);
1785
1786                         if (skb_network_header(skb2) < skb2->data ||
1787                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789                                                      ntohs(skb2->protocol),
1790                                                      dev->name);
1791                                 skb_reset_network_header(skb2);
1792                         }
1793
1794                         skb2->transport_header = skb2->network_header;
1795                         skb2->pkt_type = PACKET_OUTGOING;
1796                         pt_prev = ptype;
1797                 }
1798         }
1799         if (pt_prev)
1800                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801         rcu_read_unlock();
1802 }
1803
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this verify the tc mapping remains valid and if
1811  * not NULL the mapping. With no priorities mapping to this
1812  * offset/count pair it will no longer be used. In the worst case TC0
1813  * is invalid nothing can be done so disable priority mappings. If is
1814  * expected that drivers will fix this mapping if they can before
1815  * calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819         int i;
1820         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821
1822         /* If TC0 is invalidated disable TC mapping */
1823         if (tc->offset + tc->count > txq) {
1824                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825                 dev->num_tc = 0;
1826                 return;
1827         }
1828
1829         /* Invalidated prio to tc mappings set to TC0 */
1830         for (i = 1; i < TC_BITMASK + 1; i++) {
1831                 int q = netdev_get_prio_tc_map(dev, i);
1832
1833                 tc = &dev->tc_to_txq[q];
1834                 if (tc->offset + tc->count > txq) {
1835                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836                                 i, q);
1837                         netdev_set_prio_tc_map(dev, i, 0);
1838                 }
1839         }
1840 }
1841
1842 #ifdef CONFIG_XPS
/* Taken by the XPS map update paths in this file while they modify maps */
static DEFINE_MUTEX(xps_map_mutex);
/*
 * Update-side dereference of an XPS map pointer; lockdep checks that
 * xps_map_mutex is held.
 */
#define xmap_dereference(P)             \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1846
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848                                         int cpu, u16 index)
1849 {
1850         struct xps_map *map = NULL;
1851         int pos;
1852
1853         if (dev_maps)
1854                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855
1856         for (pos = 0; map && pos < map->len; pos++) {
1857                 if (map->queues[pos] == index) {
1858                         if (map->len > 1) {
1859                                 map->queues[pos] = map->queues[--map->len];
1860                         } else {
1861                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862                                 kfree_rcu(map, rcu);
1863                                 map = NULL;
1864                         }
1865                         break;
1866                 }
1867         }
1868
1869         return map;
1870 }
1871
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874         struct xps_dev_maps *dev_maps;
1875         int cpu, i;
1876         bool active = false;
1877
1878         mutex_lock(&xps_map_mutex);
1879         dev_maps = xmap_dereference(dev->xps_maps);
1880
1881         if (!dev_maps)
1882                 goto out_no_maps;
1883
1884         for_each_possible_cpu(cpu) {
1885                 for (i = index; i < dev->num_tx_queues; i++) {
1886                         if (!remove_xps_queue(dev_maps, cpu, i))
1887                                 break;
1888                 }
1889                 if (i == dev->num_tx_queues)
1890                         active = true;
1891         }
1892
1893         if (!active) {
1894                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1895                 kfree_rcu(dev_maps, rcu);
1896         }
1897
1898         for (i = index; i < dev->num_tx_queues; i++)
1899                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900                                              NUMA_NO_NODE);
1901
1902 out_no_maps:
1903         mutex_unlock(&xps_map_mutex);
1904 }
1905
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907                                       int cpu, u16 index)
1908 {
1909         struct xps_map *new_map;
1910         int alloc_len = XPS_MIN_MAP_ALLOC;
1911         int i, pos;
1912
1913         for (pos = 0; map && pos < map->len; pos++) {
1914                 if (map->queues[pos] != index)
1915                         continue;
1916                 return map;
1917         }
1918
1919         /* Need to add queue to this CPU's existing map */
1920         if (map) {
1921                 if (pos < map->alloc_len)
1922                         return map;
1923
1924                 alloc_len = map->alloc_len * 2;
1925         }
1926
1927         /* Need to allocate new map to store queue on this CPU's map */
1928         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929                                cpu_to_node(cpu));
1930         if (!new_map)
1931                 return NULL;
1932
1933         for (i = 0; i < pos; i++)
1934                 new_map->queues[i] = map->queues[i];
1935         new_map->alloc_len = alloc_len;
1936         new_map->len = pos;
1937
1938         return new_map;
1939 }
1940
/*
 * netif_set_xps_queue - set the CPUs that map to tx queue @index
 * @dev: network device
 * @mask: CPUs that should use tx queue @index
 * @index: tx queue number
 *
 * Builds a new device-wide XPS map in which @index is present in the map
 * of every online CPU in @mask, publishes it, releases the maps it
 * replaced and removes @index from the maps of all other CPUs.  Also
 * records the NUMA node of the queue when CONFIG_NUMA is set.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
        struct xps_map *map, *new_map;
        int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
        /* -2: no CPU examined yet, -1: selected CPUs span several nodes */
        int cpu, numa_node_id = -2;
        bool active = false;

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps);

        /* allocate memory for queue storage */
        for_each_online_cpu(cpu) {
                if (!cpumask_test_cpu(cpu, mask))
                        continue;

                /* The device map is allocated lazily, on the first match */
                if (!new_dev_maps)
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
                        mutex_unlock(&xps_map_mutex);
                        return -ENOMEM;
                }

                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;

                /* Either the old map (room left) or a bigger copy of it */
                map = expand_xps_map(map, cpu, index);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
        }

        /* No online CPU in @mask: nothing to add, only removals below */
        if (!new_dev_maps)
                goto out_no_new_maps;

        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
                        /* add queue to CPU maps */
                        int pos = 0;

                        map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        /* Append only if the queue is not already listed */
                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (numa_node_id == -2)
                                numa_node_id = cpu_to_node(cpu);
                        else if (numa_node_id != cpu_to_node(cpu))
                                numa_node_id = -1;
#endif
                } else if (dev_maps) {
                        /* fill in the new device map from the old device map */
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
                }

        }

        /* Publish the fully built map */
        rcu_assign_pointer(dev->xps_maps, new_dev_maps);

        /* Cleanup old maps */
        if (dev_maps) {
                for_each_possible_cpu(cpu) {
                        new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                        map = xmap_dereference(dev_maps->cpu_map[cpu]);
                        /* Per-CPU maps carried over unchanged are kept */
                        if (map && map != new_map)
                                kfree_rcu(map, rcu);
                }

                kfree_rcu(dev_maps, rcu);
        }

        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        /* update Tx queue numa node */
        netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                     (numa_node_id >= 0) ? numa_node_id :
                                     NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes queue from unused CPUs */
        for_each_possible_cpu(cpu) {
                if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
                        continue;

                if (remove_xps_queue(dev_maps, cpu, index))
                        active = true;
        }

        /* free map if not active */
        if (!active) {
                RCU_INIT_POINTER(dev->xps_maps, NULL);
                kfree_rcu(dev_maps, rcu);
        }

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for_each_possible_cpu(cpu) {
                new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
                map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
                                 NULL;
                /* Only maps freshly allocated by expand_xps_map() are freed */
                if (new_map && new_map != map)
                        kfree(new_map);
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2065
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073         int rc;
2074
2075         if (txq < 1 || txq > dev->num_tx_queues)
2076                 return -EINVAL;
2077
2078         if (dev->reg_state == NETREG_REGISTERED ||
2079             dev->reg_state == NETREG_UNREGISTERING) {
2080                 ASSERT_RTNL();
2081
2082                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083                                                   txq);
2084                 if (rc)
2085                         return rc;
2086
2087                 if (dev->num_tc)
2088                         netif_setup_tc(dev, txq);
2089
2090                 if (txq < dev->real_num_tx_queues) {
2091                         qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093                         netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095                 }
2096         }
2097
2098         dev->real_num_tx_queues = txq;
2099         return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102
2103 #ifdef CONFIG_SYSFS
2104 /**
2105  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2106  *      @dev: Network device
2107  *      @rxq: Actual number of RX queues
2108  *
2109  *      This must be called either with the rtnl_lock held or before
2110  *      registration of the net device.  Returns 0 on success, or a
2111  *      negative error code.  If called before registration, it always
2112  *      succeeds.
2113  */
2114 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2115 {
2116         int rc;
2117
2118         if (rxq < 1 || rxq > dev->num_rx_queues)
2119                 return -EINVAL;
2120
2121         if (dev->reg_state == NETREG_REGISTERED) {
2122                 ASSERT_RTNL();
2123
2124                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2125                                                   rxq);
2126                 if (rc)
2127                         return rc;
2128         }
2129
2130         dev->real_num_rx_queues = rxq;
2131         return 0;
2132 }
2133 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2134 #endif
2135
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine should set an upper limit on the number of RSS queues
2140  * used by default by multiqueue devices.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2147
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150         struct softnet_data *sd;
2151         unsigned long flags;
2152
2153         local_irq_save(flags);
2154         sd = this_cpu_ptr(&softnet_data);
2155         q->next_sched = NULL;
2156         *sd->output_queue_tailp = q;
2157         sd->output_queue_tailp = &q->next_sched;
2158         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159         local_irq_restore(flags);
2160 }
2161
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165                 __netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168
/*
 * Layout imposed on skb->cb (see get_kfree_skb_cb()) for skbs whose
 * freeing is deferred by __dev_kfree_skb_irq().
 */
struct dev_kfree_skb_cb {
        /* why the skb is being freed, as passed to __dev_kfree_skb_irq() */
        enum skb_free_reason reason;
};
2172
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175         return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180         rcu_read_lock();
2181         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2183
2184                 __netif_schedule(q);
2185         }
2186         rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189
2190 /**
2191  *      netif_wake_subqueue - allow sending packets on subqueue
2192  *      @dev: network device
2193  *      @queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200
2201         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202                 struct Qdisc *q;
2203
2204                 rcu_read_lock();
2205                 q = rcu_dereference(txq->qdisc);
2206                 __netif_schedule(q);
2207                 rcu_read_unlock();
2208         }
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
2211
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215                 struct Qdisc *q;
2216
2217                 rcu_read_lock();
2218                 q = rcu_dereference(dev_queue->qdisc);
2219                 __netif_schedule(q);
2220                 rcu_read_unlock();
2221         }
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227         unsigned long flags;
2228
2229         if (unlikely(!skb))
2230                 return;
2231
2232         if (likely(atomic_read(&skb->users) == 1)) {
2233                 smp_rmb();
2234                 atomic_set(&skb->users, 0);
2235         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2236                 return;
2237         }
2238         get_kfree_skb_cb(skb)->reason = reason;
2239         local_irq_save(flags);
2240         skb->next = __this_cpu_read(softnet_data.completion_queue);
2241         __this_cpu_write(softnet_data.completion_queue, skb);
2242         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2243         local_irq_restore(flags);
2244 }
2245 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2246
2247 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2248 {
2249         if (in_irq() || irqs_disabled())
2250                 __dev_kfree_skb_irq(skb, reason);
2251         else
2252                 dev_kfree_skb(skb);
2253 }
2254 EXPORT_SYMBOL(__dev_kfree_skb_any);
2255
2256
2257 /**
2258  * netif_device_detach - mark device as removed
2259  * @dev: network device
2260  *
2261  * Mark device as removed from system and therefore no longer available.
2262  */
2263 void netif_device_detach(struct net_device *dev)
2264 {
2265         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2266             netif_running(dev)) {
2267                 netif_tx_stop_all_queues(dev);
2268         }
2269 }
2270 EXPORT_SYMBOL(netif_device_detach);
2271
2272 /**
2273  * netif_device_attach - mark device as attached
2274  * @dev: network device
2275  *
2276  * Mark device as attached from system and restart if needed.
2277  */
2278 void netif_device_attach(struct net_device *dev)
2279 {
2280         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2281             netif_running(dev)) {
2282                 netif_tx_wake_all_queues(dev);
2283                 __netdev_watchdog_up(dev);
2284         }
2285 }
2286 EXPORT_SYMBOL(netif_device_attach);
2287
2288 static void skb_warn_bad_offload(const struct sk_buff *skb)
2289 {
2290         static const netdev_features_t null_features = 0;
2291         struct net_device *dev = skb->dev;
2292         const char *driver = "";
2293
2294         if (!net_ratelimit())
2295                 return;
2296
2297         if (dev && dev->dev.parent)
2298                 driver = dev_driver_string(dev->dev.parent);
2299
2300         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2301              "gso_type=%d ip_summed=%d\n",
2302              driver, dev ? &dev->features : &null_features,
2303              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2304              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2305              skb_shinfo(skb)->gso_type, skb->ip_summed);
2306 }
2307
2308 /*
2309  * Invalidate hardware checksum when packet is to be mangled, and
2310  * complete checksum manually on outgoing path.
2311  */
2312 int skb_checksum_help(struct sk_buff *skb)
2313 {
2314         __wsum csum;
2315         int ret = 0, offset;
2316
2317         if (skb->ip_summed == CHECKSUM_COMPLETE)
2318                 goto out_set_summed;
2319
2320         if (unlikely(skb_shinfo(skb)->gso_size)) {
2321                 skb_warn_bad_offload(skb);
2322                 return -EINVAL;
2323         }
2324
2325         /* Before computing a checksum, we should make sure no frag could
2326          * be modified by an external entity : checksum could be wrong.
2327          */
2328         if (skb_has_shared_frag(skb)) {
2329                 ret = __skb_linearize(skb);
2330                 if (ret)
2331                         goto out;
2332         }
2333
2334         offset = skb_checksum_start_offset(skb);
2335         BUG_ON(offset >= skb_headlen(skb));
2336         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2337
2338         offset += skb->csum_offset;
2339         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2340
2341         if (skb_cloned(skb) &&
2342             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2343                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2344                 if (ret)
2345                         goto out;
2346         }
2347
2348         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2349 out_set_summed:
2350         skb->ip_summed = CHECKSUM_NONE;
2351 out:
2352         return ret;
2353 }
2354 EXPORT_SYMBOL(skb_checksum_help);
2355
2356 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2357 {
2358         unsigned int vlan_depth = skb->mac_len;
2359         __be16 type = skb->protocol;
2360
2361         /* Tunnel gso handlers can set protocol to ethernet. */
2362         if (type == htons(ETH_P_TEB)) {
2363                 struct ethhdr *eth;
2364
2365                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2366                         return 0;
2367
2368                 eth = (struct ethhdr *)skb_mac_header(skb);
2369                 type = eth->h_proto;
2370         }
2371
2372         /* if skb->protocol is 802.1Q/AD then the header should already be
2373          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2374          * ETH_HLEN otherwise
2375          */
2376         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2377                 if (vlan_depth) {
2378                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2379                                 return 0;
2380                         vlan_depth -= VLAN_HLEN;
2381                 } else {
2382                         vlan_depth = ETH_HLEN;
2383                 }
2384                 do {
2385                         struct vlan_hdr *vh;
2386
2387                         if (unlikely(!pskb_may_pull(skb,
2388                                                     vlan_depth + VLAN_HLEN)))
2389                                 return 0;
2390
2391                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2392                         type = vh->h_vlan_encapsulated_proto;
2393                         vlan_depth += VLAN_HLEN;
2394                 } while (type == htons(ETH_P_8021Q) ||
2395                          type == htons(ETH_P_8021AD));
2396         }
2397
2398         *depth = vlan_depth;
2399
2400         return type;
2401 }
2402
2403 /**
2404  *      skb_mac_gso_segment - mac layer segmentation handler.
2405  *      @skb: buffer to segment
2406  *      @features: features for the output path (see dev->features)
2407  */
2408 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2409                                     netdev_features_t features)
2410 {
2411         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2412         struct packet_offload *ptype;
2413         int vlan_depth = skb->mac_len;
2414         __be16 type = skb_network_protocol(skb, &vlan_depth);
2415
2416         if (unlikely(!type))
2417                 return ERR_PTR(-EINVAL);
2418
2419         __skb_pull(skb, vlan_depth);
2420
2421         rcu_read_lock();
2422         list_for_each_entry_rcu(ptype, &offload_base, list) {
2423                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2424                         segs = ptype->callbacks.gso_segment(skb, features);
2425                         break;
2426                 }
2427         }
2428         rcu_read_unlock();
2429
2430         __skb_push(skb, skb->data - skb_mac_header(skb));
2431
2432         return segs;
2433 }
2434 EXPORT_SYMBOL(skb_mac_gso_segment);
2435
2436
2437 /* openvswitch calls this on rx path, so we need a different check.
2438  */
2439 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2440 {
2441         if (tx_path)
2442                 return skb->ip_summed != CHECKSUM_PARTIAL &&
2443                        skb->ip_summed != CHECKSUM_UNNECESSARY;
2444
2445         return skb->ip_summed == CHECKSUM_NONE;
2446 }
2447
2448 /**
2449  *      __skb_gso_segment - Perform segmentation on skb.
2450  *      @skb: buffer to segment
2451  *      @features: features for the output path (see dev->features)
2452  *      @tx_path: whether it is called in TX path
2453  *
2454  *      This function segments the given skb and returns a list of segments.
2455  *
2456  *      It may return NULL if the skb requires no segmentation.  This is
2457  *      only possible when GSO is used for verifying header integrity.
2458  */
2459 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2460                                   netdev_features_t features, bool tx_path)
2461 {
2462         struct sk_buff *segs;
2463
2464         if (unlikely(skb_needs_check(skb, tx_path))) {
2465                 int err;
2466
2467                 /* We're going to init ->check field in TCP or UDP header */
2468                 err = skb_cow_head(skb, 0);
2469                 if (err < 0)
2470                         return ERR_PTR(err);
2471         }
2472
2473         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2474         SKB_GSO_CB(skb)->encap_level = 0;
2475
2476         skb_reset_mac_header(skb);
2477         skb_reset_mac_len(skb);
2478
2479         segs = skb_mac_gso_segment(skb, features);
2480
2481         if (unlikely(skb_needs_check(skb, tx_path)))
2482                 skb_warn_bad_offload(skb);
2483
2484         return segs;
2485 }
2486 EXPORT_SYMBOL(__skb_gso_segment);
2487
2488 /* Take action when hardware reception checksum errors are detected. */
2489 #ifdef CONFIG_BUG
2490 void netdev_rx_csum_fault(struct net_device *dev)
2491 {
2492         if (net_ratelimit()) {
2493                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2494                 dump_stack();
2495         }
2496 }
2497 EXPORT_SYMBOL(netdev_rx_csum_fault);
2498 #endif
2499
/* Actually, this check should be eliminated as soon as we know that:
 * 1. An IOMMU is present and allows mapping all of the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev must not be handed
 * (a highmem page without NETIF_F_HIGHDMA, or a page beyond the parent
 * device's DMA mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int idx;

        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (idx = 0; idx < shinfo->nr_frags; idx++) {
                        if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                struct device *parent = dev->dev.parent;

                if (!parent)
                        return 0;

                for (idx = 0; idx < shinfo->nr_frags; idx++) {
                        skb_frag_t *frag = &shinfo->frags[idx];
                        dma_addr_t addr = page_to_phys(skb_frag_page(frag));

                        /* No mask at all, or the page ends beyond it */
                        if (!parent->dma_mask ||
                            addr + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
2532
2533 /* If MPLS offload request, verify we are testing hardware MPLS features
2534  * instead of standard features for the netdev.
2535  */
2536 #ifdef CONFIG_NET_MPLS_GSO
2537 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2538                                            netdev_features_t features,
2539                                            __be16 type)
2540 {
2541         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2542                 features &= skb->dev->mpls_features;
2543
2544         return features;
2545 }
2546 #else
2547 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2548                                            netdev_features_t features,
2549                                            __be16 type)
2550 {
2551         return features;
2552 }
2553 #endif
2554
2555 static netdev_features_t harmonize_features(struct sk_buff *skb,
2556         netdev_features_t features)
2557 {
2558         int tmp;
2559         __be16 type;
2560
2561         type = skb_network_protocol(skb, &tmp);
2562         features = net_mpls_features(skb, features, type);
2563
2564         if (skb->ip_summed != CHECKSUM_NONE &&
2565             !can_checksum_protocol(features, type)) {
2566                 features &= ~NETIF_F_ALL_CSUM;
2567         } else if (illegal_highdma(skb->dev, skb)) {
2568                 features &= ~NETIF_F_SG;
2569         }
2570
2571         return features;
2572 }
2573
2574 netdev_features_t netif_skb_features(struct sk_buff *skb)
2575 {
2576         struct net_device *dev = skb->dev;
2577         netdev_features_t features = dev->features;
2578         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2579         __be16 protocol = skb->protocol;
2580
2581         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2582                 features &= ~NETIF_F_GSO_MASK;
2583
2584         /* If encapsulation offload request, verify we are testing
2585          * hardware encapsulation features instead of standard
2586          * features for the netdev
2587          */
2588         if (skb->encapsulation)
2589                 features &= dev->hw_enc_features;
2590
2591         if (!vlan_tx_tag_present(skb)) {
2592                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2593                              protocol == htons(ETH_P_8021AD))) {
2594                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2595                         protocol = veh->h_vlan_encapsulated_proto;
2596                 } else {
2597                         goto finalize;
2598                 }
2599         }
2600
2601         features = netdev_intersect_features(features,
2602                                              dev->vlan_features |
2603                                              NETIF_F_HW_VLAN_CTAG_TX |
2604                                              NETIF_F_HW_VLAN_STAG_TX);
2605
2606         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2607                 features = netdev_intersect_features(features,
2608                                                      NETIF_F_SG |
2609                                                      NETIF_F_HIGHDMA |
2610                                                      NETIF_F_FRAGLIST |
2611                                                      NETIF_F_GEN_CSUM |
2612                                                      NETIF_F_HW_VLAN_CTAG_TX |
2613                                                      NETIF_F_HW_VLAN_STAG_TX);
2614
2615 finalize:
2616         if (dev->netdev_ops->ndo_features_check)
2617                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2618                                                                 features);
2619
2620         return harmonize_features(skb, features);
2621 }
2622 EXPORT_SYMBOL(netif_skb_features);
2623
2624 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2625                     struct netdev_queue *txq, bool more)
2626 {
2627         unsigned int len;
2628         int rc;
2629
2630         if (!list_empty(&ptype_all))
2631                 dev_queue_xmit_nit(skb, dev);
2632
2633         len = skb->len;
2634         trace_net_dev_start_xmit(skb, dev);
2635         rc = netdev_start_xmit(skb, dev, txq, more);
2636         trace_net_dev_xmit(skb, rc, dev, len);
2637
2638         return rc;
2639 }
2640
2641 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2642                                     struct netdev_queue *txq, int *ret)
2643 {
2644         struct sk_buff *skb = first;
2645         int rc = NETDEV_TX_OK;
2646
2647         while (skb) {
2648                 struct sk_buff *next = skb->next;
2649
2650                 skb->next = NULL;
2651                 rc = xmit_one(skb, dev, txq, next != NULL);
2652                 if (unlikely(!dev_xmit_complete(rc))) {
2653                         skb->next = next;
2654                         goto out;
2655                 }
2656
2657                 skb = next;
2658                 if (netif_xmit_stopped(txq) && skb) {
2659                         rc = NETDEV_TX_BUSY;
2660                         break;
2661                 }
2662         }
2663
2664 out:
2665         *ret = rc;
2666         return skb;
2667 }
2668
2669 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2670                                           netdev_features_t features)
2671 {
2672         if (vlan_tx_tag_present(skb) &&
2673             !vlan_hw_offload_capable(features, skb->vlan_proto))
2674                 skb = __vlan_hwaccel_push_inside(skb);
2675         return skb;
2676 }
2677
2678 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2679 {
2680         netdev_features_t features;
2681
2682         if (skb->next)
2683                 return skb;
2684
2685         features = netif_skb_features(skb);
2686         skb = validate_xmit_vlan(skb, features);
2687         if (unlikely(!skb))
2688                 goto out_null;
2689
2690         if (netif_needs_gso(dev, skb, features)) {
2691                 struct sk_buff *segs;
2692
2693                 segs = skb_gso_segment(skb, features);
2694                 if (IS_ERR(segs)) {
2695                         goto out_kfree_skb;
2696                 } else if (segs) {
2697                         consume_skb(skb);
2698                         skb = segs;
2699                 }
2700         } else {
2701                 if (skb_needs_linearize(skb, features) &&
2702                     __skb_linearize(skb))
2703                         goto out_kfree_skb;
2704
2705                 /* If packet is not checksummed and device does not
2706                  * support checksumming for this protocol, complete
2707                  * checksumming here.
2708                  */
2709                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2710                         if (skb->encapsulation)
2711                                 skb_set_inner_transport_header(skb,
2712                                                                skb_checksum_start_offset(skb));
2713                         else
2714                                 skb_set_transport_header(skb,
2715                                                          skb_checksum_start_offset(skb));
2716                         if (!(features & NETIF_F_ALL_CSUM) &&
2717                             skb_checksum_help(skb))
2718                                 goto out_kfree_skb;
2719                 }
2720         }
2721
2722         return skb;
2723
2724 out_kfree_skb:
2725         kfree_skb(skb);
2726 out_null:
2727         return NULL;
2728 }
2729
2730 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2731 {
2732         struct sk_buff *next, *head = NULL, *tail;
2733
2734         for (; skb != NULL; skb = next) {
2735                 next = skb->next;
2736                 skb->next = NULL;
2737
2738                 /* in case skb wont be segmented, point to itself */
2739                 skb->prev = skb;
2740
2741                 skb = validate_xmit_skb(skb, dev);
2742                 if (!skb)
2743                         continue;
2744
2745                 if (!head)
2746                         head = skb;
2747                 else
2748                         tail->next = skb;
2749                 /* If skb was segmented, skb->prev points to
2750                  * the last segment. If not, it still contains skb.
2751                  */
2752                 tail = skb->prev;
2753         }
2754         return head;
2755 }
2756 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2757
2758 static void qdisc_pkt_len_init(struct sk_buff *skb)
2759 {
2760         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2761
2762         qdisc_skb_cb(skb)->pkt_len = skb->len;
2763
2764         /* To get more precise estimation of bytes sent on wire,
2765          * we add to pkt_len the headers size of all segments
2766          */
2767         if (shinfo->gso_size)  {
2768                 unsigned int hdr_len;
2769                 u16 gso_segs = shinfo->gso_segs;
2770
2771                 /* mac layer + network layer */
2772                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2773
2774                 /* + transport layer */
2775                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
2776                         const struct tcphdr *th;
2777                         struct tcphdr _tcphdr;
2778
2779                         th = skb_header_pointer(skb, skb_transport_offset(skb),
2780                                                 sizeof(_tcphdr), &_tcphdr);
2781                         if (likely(th))
2782                                 hdr_len += __tcp_hdrlen(th);
2783                 } else {
2784                         struct udphdr _udphdr;
2785
2786                         if (skb_header_pointer(skb, skb_transport_offset(skb),
2787                                                sizeof(_udphdr), &_udphdr))
2788                                 hdr_len += sizeof(struct udphdr);
2789                 }
2790
2791                 if (shinfo->gso_type & SKB_GSO_DODGY)
2792                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2793                                                 shinfo->gso_size);
2794
2795                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2796         }
2797 }
2798
2799 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2800                                  struct net_device *dev,
2801                                  struct netdev_queue *txq)
2802 {
2803         spinlock_t *root_lock = qdisc_lock(q);
2804         bool contended;
2805         int rc;
2806
2807         qdisc_pkt_len_init(skb);
2808         qdisc_calculate_pkt_len(skb, q);
2809         /*
2810          * Heuristic to force contended enqueues to serialize on a
2811          * separate lock before trying to get qdisc main lock.
2812          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2813          * often and dequeue packets faster.
2814          */
2815         contended = qdisc_is_running(q);
2816         if (unlikely(contended))
2817                 spin_lock(&q->busylock);
2818
2819         spin_lock(root_lock);
2820         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2821                 kfree_skb(skb);
2822                 rc = NET_XMIT_DROP;
2823         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2824                    qdisc_run_begin(q)) {
2825                 /*
2826                  * This is a work-conserving queue; there are no old skbs
2827                  * waiting to be sent out; and the qdisc is not running -
2828                  * xmit the skb directly.
2829                  */
2830
2831                 qdisc_bstats_update(q, skb);
2832
2833                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2834                         if (unlikely(contended)) {
2835                                 spin_unlock(&q->busylock);
2836                                 contended = false;
2837                         }
2838                         __qdisc_run(q);
2839                 } else
2840                         qdisc_run_end(q);
2841
2842                 rc = NET_XMIT_SUCCESS;
2843         } else {
2844                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2845                 if (qdisc_run_begin(q)) {
2846                         if (unlikely(contended)) {
2847                                 spin_unlock(&q->busylock);
2848                                 contended = false;
2849                         }
2850                         __qdisc_run(q);
2851                 }
2852         }
2853         spin_unlock(root_lock);
2854         if (unlikely(contended))
2855                 spin_unlock(&q->busylock);
2856         return rc;
2857 }
2858
2859 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2860 static void skb_update_prio(struct sk_buff *skb)
2861 {
2862         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2863
2864         if (!skb->priority && skb->sk && map) {
2865                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2866
2867                 if (prioidx < map->priomap_len)
2868                         skb->priority = map->priomap[prioidx];
2869         }
2870 }
2871 #else
2872 #define skb_update_prio(skb)
2873 #endif
2874
2875 DEFINE_PER_CPU(int, xmit_recursion);
2876 EXPORT_SYMBOL(xmit_recursion);
2877
2878 #define RECURSION_LIMIT 10
2879
2880 /**
2881  *      dev_loopback_xmit - loop back @skb
2882  *      @skb: buffer to transmit
2883  */
2884 int dev_loopback_xmit(struct sk_buff *skb)
2885 {
2886         skb_reset_mac_header(skb);
2887         __skb_pull(skb, skb_network_offset(skb));
2888         skb->pkt_type = PACKET_LOOPBACK;
2889         skb->ip_summed = CHECKSUM_UNNECESSARY;
2890         WARN_ON(!skb_dst(skb));
2891         skb_dst_force(skb);
2892         netif_rx_ni(skb);
2893         return 0;
2894 }
2895 EXPORT_SYMBOL(dev_loopback_xmit);
2896
2897 /**
2898  *      __dev_queue_xmit - transmit a buffer
2899  *      @skb: buffer to transmit
2900  *      @accel_priv: private data used for L2 forwarding offload
2901  *
2902  *      Queue a buffer for transmission to a network device. The caller must
2903  *      have set the device and priority and built the buffer before calling
2904  *      this function. The function can be called from an interrupt.
2905  *
2906  *      A negative errno code is returned on a failure. A success does not
2907  *      guarantee the frame will be transmitted as it may be dropped due
2908  *      to congestion or traffic shaping.
2909  *
2910  * -----------------------------------------------------------------------------------
2911  *      I notice this method can also return errors from the queue disciplines,
2912  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2913  *      be positive.
2914  *
2915  *      Regardless of the return value, the skb is consumed, so it is currently
2916  *      difficult to retry a send to this method.  (You can bump the ref count
2917  *      before sending to hold a reference for retry if you are careful.)
2918  *
2919  *      When calling this method, interrupts MUST be enabled.  This is because
2920  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2921  *          --BLG
2922  */
2923 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2924 {
2925         struct net_device *dev = skb->dev;
2926         struct netdev_queue *txq;
2927         struct Qdisc *q;
2928         int rc = -ENOMEM;
2929
2930         skb_reset_mac_header(skb);
2931
2932         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2933                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2934
2935         /* Disable soft irqs for various locks below. Also
2936          * stops preemption for RCU.
2937          */
2938         rcu_read_lock_bh();
2939
2940         skb_update_prio(skb);
2941
2942         /* If device/qdisc don't need skb->dst, release it right now while
2943          * its hot in this cpu cache.
2944          */
2945         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2946                 skb_dst_drop(skb);
2947         else
2948                 skb_dst_force(skb);
2949
2950         txq = netdev_pick_tx(dev, skb, accel_priv);
2951         q = rcu_dereference_bh(txq->qdisc);
2952
2953 #ifdef CONFIG_NET_CLS_ACT
2954         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2955 #endif
2956         trace_net_dev_queue(skb);
2957         if (q->enqueue) {
2958                 rc = __dev_xmit_skb(skb, q, dev, txq);
2959                 goto out;
2960         }
2961
2962         /* The device has no queue. Common case for software devices:
2963            loopback, all the sorts of tunnels...
2964
2965            Really, it is unlikely that netif_tx_lock protection is necessary
2966            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2967            counters.)
2968            However, it is possible, that they rely on protection
2969            made by us here.
2970
2971            Check this and shot the lock. It is not prone from deadlocks.
2972            Either shot noqueue qdisc, it is even simpler 8)
2973          */
2974         if (dev->flags & IFF_UP) {
2975                 int cpu = smp_processor_id(); /* ok because BHs are off */
2976
2977                 if (txq->xmit_lock_owner != cpu) {
2978
2979                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2980                                 goto recursion_alert;
2981
2982                         skb = validate_xmit_skb(skb, dev);
2983                         if (!skb)
2984                                 goto drop;
2985
2986                         HARD_TX_LOCK(dev, txq, cpu);
2987
2988                         if (!netif_xmit_stopped(txq)) {
2989                                 __this_cpu_inc(xmit_recursion);
2990                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2991                                 __this_cpu_dec(xmit_recursion);
2992                                 if (dev_xmit_complete(rc)) {
2993                                         HARD_TX_UNLOCK(dev, txq);
2994                                         goto out;
2995                                 }
2996                         }
2997                         HARD_TX_UNLOCK(dev, txq);
2998                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2999                                              dev->name);
3000                 } else {
3001                         /* Recursion is detected! It is possible,
3002                          * unfortunately
3003                          */
3004 recursion_alert:
3005                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3006                                              dev->name);
3007                 }
3008         }
3009
3010         rc = -ENETDOWN;
3011 drop:
3012         rcu_read_unlock_bh();
3013
3014         atomic_long_inc(&dev->tx_dropped);
3015         kfree_skb_list(skb);
3016         return rc;
3017 out:
3018         rcu_read_unlock_bh();
3019         return rc;
3020 }
3021
3022 int dev_queue_xmit(struct sk_buff *skb)
3023 {
3024         return __dev_queue_xmit(skb, NULL);
3025 }
3026 EXPORT_SYMBOL(dev_queue_xmit);
3027
/* Queue @skb for transmission on skb->dev, passing @accel_priv (private
 * data used for L2 forwarding offload) on to __dev_queue_xmit().  Returns
 * the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3033
3034
3035 /*=======================================================================
3036                         Receiver routines
3037   =======================================================================*/
3038
3039 int netdev_max_backlog __read_mostly = 1000;
3040 EXPORT_SYMBOL(netdev_max_backlog);
3041
3042 int netdev_tstamp_prequeue __read_mostly = 1;
3043 int netdev_budget __read_mostly = 300;
3044 int weight_p __read_mostly = 64;            /* old backlog weight */
3045
3046 /* Called with irq disabled */
3047 static inline void ____napi_schedule(struct softnet_data *sd,
3048                                      struct napi_struct *napi)
3049 {
3050         list_add_tail(&napi->poll_list, &sd->poll_list);
3051         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3052 }
3053
3054 #ifdef CONFIG_RPS
3055
3056 /* One global table that all flow-based protocols share. */
3057 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3058 EXPORT_SYMBOL(rps_sock_flow_table);
3059
3060 struct static_key rps_needed __read_mostly;
3061
3062 static struct rps_dev_flow *
3063 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3064             struct rps_dev_flow *rflow, u16 next_cpu)
3065 {
3066         if (next_cpu != RPS_NO_CPU) {
3067 #ifdef CONFIG_RFS_ACCEL
3068                 struct netdev_rx_queue *rxqueue;
3069                 struct rps_dev_flow_table *flow_table;
3070                 struct rps_dev_flow *old_rflow;
3071                 u32 flow_id;
3072                 u16 rxq_index;
3073                 int rc;
3074
3075                 /* Should we steer this flow to a different hardware queue? */
3076                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3077                     !(dev->features & NETIF_F_NTUPLE))
3078                         goto out;
3079                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3080                 if (rxq_index == skb_get_rx_queue(skb))
3081                         goto out;
3082
3083                 rxqueue = dev->_rx + rxq_index;
3084                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3085                 if (!flow_table)
3086                         goto out;
3087                 flow_id = skb_get_hash(skb) & flow_table->mask;
3088                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3089                                                         rxq_index, flow_id);
3090                 if (rc < 0)
3091                         goto out;
3092                 old_rflow = rflow;
3093                 rflow = &flow_table->flows[flow_id];
3094                 rflow->filter = rc;
3095                 if (old_rflow->filter == rflow->filter)
3096                         old_rflow->filter = RPS_NO_FILTER;
3097         out:
3098 #endif
3099                 rflow->last_qtail =
3100                         per_cpu(softnet_data, next_cpu).input_queue_head;
3101         }
3102
3103         rflow->cpu = next_cpu;
3104         return rflow;
3105 }
3106
3107 /*
3108  * get_rps_cpu is called from netif_receive_skb and returns the target
3109  * CPU from the RPS map of the receiving queue for a given skb.
3110  * rcu_read_lock must be held on entry.
3111  */
3112 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3113                        struct rps_dev_flow **rflowp)
3114 {
3115         struct netdev_rx_queue *rxqueue;
3116         struct rps_map *map;
3117         struct rps_dev_flow_table *flow_table;
3118         struct rps_sock_flow_table *sock_flow_table;
3119         int cpu = -1;
3120         u16 tcpu;
3121         u32 hash;
3122
3123         if (skb_rx_queue_recorded(skb)) {
3124                 u16 index = skb_get_rx_queue(skb);
3125                 if (unlikely(index >= dev->real_num_rx_queues)) {
3126                         WARN_ONCE(dev->real_num_rx_queues > 1,
3127                                   "%s received packet on queue %u, but number "
3128                                   "of RX queues is %u\n",
3129                                   dev->name, index, dev->real_num_rx_queues);
3130                         goto done;
3131                 }
3132                 rxqueue = dev->_rx + index;
3133         } else
3134                 rxqueue = dev->_rx;
3135
3136         map = rcu_dereference(rxqueue->rps_map);
3137         if (map) {
3138                 if (map->len == 1 &&
3139                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3140                         tcpu = map->cpus[0];
3141                         if (cpu_online(tcpu))
3142                                 cpu = tcpu;
3143                         goto done;
3144                 }
3145         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3146                 goto done;
3147         }
3148
3149         skb_reset_network_header(skb);
3150         hash = skb_get_hash(skb);
3151         if (!hash)
3152                 goto done;
3153
3154         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3155         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3156         if (flow_table && sock_flow_table) {
3157                 u16 next_cpu;
3158                 struct rps_dev_flow *rflow;
3159
3160                 rflow = &flow_table->flows[hash & flow_table->mask];
3161                 tcpu = rflow->cpu;
3162
3163                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3164
3165                 /*
3166                  * If the desired CPU (where last recvmsg was done) is
3167                  * different from current CPU (one in the rx-queue flow
3168                  * table entry), switch if one of the following holds:
3169                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3170                  *   - Current CPU is offline.
3171                  *   - The current CPU's queue tail has advanced beyond the
3172                  *     last packet that was enqueued using this table entry.
3173                  *     This guarantees that all previous packets for the flow
3174                  *     have been dequeued, thus preserving in order delivery.
3175                  */
3176                 if (unlikely(tcpu != next_cpu) &&
3177                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3178                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3179                       rflow->last_qtail)) >= 0)) {
3180                         tcpu = next_cpu;
3181                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3182                 }
3183
3184                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3185                         *rflowp = rflow;
3186                         cpu = tcpu;
3187                         goto done;
3188                 }
3189         }
3190
3191         if (map) {
3192                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3193                 if (cpu_online(tcpu)) {
3194                         cpu = tcpu;
3195                         goto done;
3196                 }
3197         }
3198
3199 done:
3200         return cpu;
3201 }
3202
3203 #ifdef CONFIG_RFS_ACCEL
3204
3205 /**
3206  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3207  * @dev: Device on which the filter was set
3208  * @rxq_index: RX queue index
3209  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3210  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3211  *
3212  * Drivers that implement ndo_rx_flow_steer() should periodically call
3213  * this function for each installed filter and remove the filters for
3214  * which it returns %true.
3215  */
3216 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3217                          u32 flow_id, u16 filter_id)
3218 {
3219         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3220         struct rps_dev_flow_table *flow_table;
3221         struct rps_dev_flow *rflow;
3222         bool expire = true;
3223         int cpu;
3224
3225         rcu_read_lock();
3226         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3227         if (flow_table && flow_id <= flow_table->mask) {
3228                 rflow = &flow_table->flows[flow_id];
3229                 cpu = ACCESS_ONCE(rflow->cpu);
3230                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3231                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3232                            rflow->last_qtail) <
3233                      (int)(10 * flow_table->mask)))
3234                         expire = false;
3235         }
3236         rcu_read_unlock();
3237         return expire;
3238 }
3239 EXPORT_SYMBOL(rps_may_expire_flow);
3240
3241 #endif /* CONFIG_RFS_ACCEL */
3242
3243 /* Called from hardirq (IPI) context */
3244 static void rps_trigger_softirq(void *data)
3245 {
3246         struct softnet_data *sd = data;
3247
3248         ____napi_schedule(sd, &sd->backlog);
3249         sd->received_rps++;
3250 }
3251
3252 #endif /* CONFIG_RPS */
3253
/*
 * Check whether @sd belongs to another cpu.
 * If so, put it on this cpu's IPI list, raise NET_RX_SOFTIRQ and return 1.
 * Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local = this_cpu_ptr(&softnet_data);

	if (sd == local)
		return 0;

	/* Push @sd on the head of the local IPI list. */
	sd->rps_ipi_next = local->rps_ipi_list;
	local->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3274
#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length; initial value is 4096 (1 << 12). */
int netdev_flow_limit_table_len __read_mostly = 1 << 12;
#endif
3278
3279 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3280 {
3281 #ifdef CONFIG_NET_FLOW_LIMIT
3282         struct sd_flow_limit *fl;
3283         struct softnet_data *sd;
3284         unsigned int old_flow, new_flow;
3285
3286         if (qlen < (netdev_max_backlog >> 1))
3287                 return false;
3288
3289         sd = this_cpu_ptr(&softnet_data);
3290
3291         rcu_read_lock();
3292         fl = rcu_dereference(sd->flow_limit);
3293         if (fl) {
3294                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3295                 old_flow = fl->history[fl->history_head];
3296                 fl->history[fl->history_head] = new_flow;
3297
3298                 fl->history_head++;
3299                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3300
3301                 if (likely(fl->buckets[old_flow]))
3302                         fl->buckets[old_flow]--;
3303
3304                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3305                         fl->count++;
3306                         rcu_read_unlock();
3307                         return true;
3308                 }
3309         }
3310         rcu_read_unlock();
3311 #endif
3312         return false;
3313 }
3314
3315 /*
3316  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3317  * queue (may be a remote CPU queue).
3318  */
3319 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3320                               unsigned int *qtail)
3321 {
3322         struct softnet_data *sd;
3323         unsigned long flags;
3324         unsigned int qlen;
3325
3326         sd = &per_cpu(softnet_data, cpu);
3327
3328         local_irq_save(flags);
3329
3330         rps_lock(sd);
3331         if (!netif_running(skb->dev))
3332                 goto drop;
3333         qlen = skb_queue_len(&sd->input_pkt_queue);
3334         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3335                 if (skb_queue_len(&sd->input_pkt_queue)) {
3336 enqueue:
3337                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3338                         input_queue_tail_incr_save(sd, qtail);
3339                         rps_unlock(sd);
3340                         local_irq_restore(flags);
3341                         return NET_RX_SUCCESS;
3342                 }
3343
3344                 /* Schedule NAPI for backlog device
3345                  * We can use non atomic operation since we own the queue lock
3346                  */
3347                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3348                         if (!rps_ipi_queued(sd))
3349                                 ____napi_schedule(sd, &sd->backlog);
3350                 }
3351                 goto enqueue;
3352         }
3353
3354 drop:
3355         sd->dropped++;
3356         rps_unlock(sd);
3357
3358         local_irq_restore(flags);
3359
3360         atomic_long_inc(&skb->dev->rx_dropped);
3361         kfree_skb(skb);
3362         return NET_RX_DROP;
3363 }
3364
3365 static int netif_rx_internal(struct sk_buff *skb)
3366 {
3367         int ret;
3368
3369         net_timestamp_check(netdev_tstamp_prequeue, skb);
3370
3371         trace_netif_rx(skb);
3372 #ifdef CONFIG_RPS
3373         if (static_key_false(&rps_needed)) {
3374                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3375                 int cpu;
3376
3377                 preempt_disable();
3378                 rcu_read_lock();
3379
3380                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3381                 if (cpu < 0)
3382                         cpu = smp_processor_id();
3383
3384                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3385
3386                 rcu_read_unlock();
3387                 preempt_enable();
3388         } else
3389 #endif
3390         {
3391                 unsigned int qtail;
3392                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3393                 put_cpu();
3394         }
3395         return ret;
3396 }
3397
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Entry point for device drivers: the received packet is queued so the
 *	upper (protocol) layers can process it later.  The call itself always
 *	succeeds; the buffer may still be dropped afterwards, either for
 *	congestion control or by the protocol layers.
 *
 *	Return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_entry(skb);
	rc = netif_rx_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_rx);
3420
/*
 * Variant of netif_rx() that, after queueing @skb, runs do_softirq() itself
 * when local_softirq_pending() reports pending work.  Queueing and the
 * softirq check both happen with preemption disabled.
 *
 * Returns the value produced by netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	rc = netif_rx_internal(skb);
	/* Process whatever is now pending before re-enabling preemption. */
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
3436
3437 static void net_tx_action(struct softirq_action *h)
3438 {
3439         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3440
3441         if (sd->completion_queue) {
3442                 struct sk_buff *clist;
3443
3444                 local_irq_disable();
3445                 clist = sd->completion_queue;
3446                 sd->completion_queue = NULL;
3447                 local_irq_enable();
3448
3449                 while (clist) {
3450                         struct sk_buff *skb = clist;
3451                         clist = clist->next;
3452
3453                         WARN_ON(atomic_read(&skb->users));
3454                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3455                                 trace_consume_skb(skb);
3456                         else
3457                                 trace_kfree_skb(skb, net_tx_action);
3458                         __kfree_skb(skb);
3459                 }
3460         }
3461
3462         if (sd->output_queue) {
3463                 struct Qdisc *head;
3464
3465                 local_irq_disable();
3466                 head = sd->output_queue;
3467                 sd->output_queue = NULL;
3468                 sd->output_queue_tailp = &sd->output_queue;
3469                 local_irq_enable();
3470
3471                 while (head) {
3472                         struct Qdisc *q = head;
3473                         spinlock_t *root_lock;
3474
3475                         head = head->next_sched;
3476
3477                         root_lock = qdisc_lock(q);
3478                         if (spin_trylock(root_lock)) {
3479                                 smp_mb__before_atomic();
3480                                 clear_bit(__QDISC_STATE_SCHED,
3481                                           &q->state);
3482                                 qdisc_run(q);
3483                                 spin_unlock(root_lock);
3484                         } else {
3485                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3486                                               &q->state)) {
3487                                         __netif_reschedule(q);
3488                                 } else {
3489                                         smp_mb__before_atomic();
3490                                         clear_bit(__QDISC_STATE_SCHED,
3491                                                   &q->state);
3492                                 }
3493                         }
3494                 }
3495         }
3496 }
3497
3498 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3499     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3500 /* This hook is defined here for ATM LANE.
      *
      * It is only compiled when both bridging and ATM LANE are enabled
      * (built in or as modules).  The pointer is declared but not assigned
      * here; NOTE(review): presumably the bridge code installs the handler
      * at runtime -- confirm against the bridge module.  Exported GPL-only.
      */
3501 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3502                              unsigned char *addr) __read_mostly;
3503 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3504 #endif
3505
3506 #ifdef CONFIG_NET_CLS_ACT
3507 /* TODO: Maybe we should just force sch_ingress to be compiled in
3508  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3509  * a compare and 2 stores extra right now if we dont have it on
3510  * but have CONFIG_NET_CLS_ACT
3511  * NOTE: This doesn't stop any functionality; if you dont have
3512  * the ingress scheduler, you just can't add policies on ingress.
3513  *
3514  */
3515 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3516 {
3517         struct net_device *dev = skb->dev;
3518         u32 ttl = G_TC_RTTL(skb->tc_verd);
3519         int result = TC_ACT_OK;
3520         struct Qdisc *q;
3521
3522         if (unlikely(MAX_RED_LOOP < ttl++)) {
3523                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3524                                      skb->skb_iif, dev->ifindex);
3525                 return TC_ACT_SHOT;
3526         }
3527
3528         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3529         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3530
3531         q = rcu_dereference(rxq->qdisc);
3532         if (q != &noop_qdisc) {
3533                 spin_lock(qdisc_lock(q));
3534                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3535                         result = qdisc_enqueue_root(skb, q);
3536                 spin_unlock(qdisc_lock(q));
3537         }
3538
3539         return result;
3540 }
3541
3542 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3543                                          struct packet_type **pt_prev,
3544                                          int *ret, struct net_device *orig_dev)
3545 {
3546         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3547
3548         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3549                 goto out;
3550
3551         if (*pt_prev) {
3552                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3553                 *pt_prev = NULL;
3554         }
3555
3556         switch (ing_filter(skb, rxq)) {
3557         case TC_ACT_SHOT:
3558         case TC_ACT_STOLEN:
3559                 kfree_skb(skb);
3560                 return NULL;
3561         }
3562
3563 out:
3564         skb->tc_verd = 0;
3565         return skb;
3566 }
3567 #endif
3568
3569 /**
3570  *      netdev_rx_handler_register - register receive handler
3571  *      @dev: device to register a handler for
3572  *      @rx_handler: receive handler to register
3573  *      @rx_handler_data: data pointer that is used by rx handler
3574  *
3575  *      Register a receive handler for a device. This handler will then be
3576  *      called from __netif_receive_skb. A negative errno code is returned
3577  *      on a failure.
3578  *
3579  *      The caller must hold the rtnl_mutex.
3580  *
3581  *      For a general description of rx_handler, see enum rx_handler_result.
3582  */
3583 int netdev_rx_handler_register(struct net_device *dev,
3584                                rx_handler_func_t *rx_handler,
3585                                void *rx_handler_data)
3586 {
3587         ASSERT_RTNL();
3588
3589         if (dev->rx_handler)
3590                 return -EBUSY;
3591
3592         /* Note: rx_handler_data must be set before rx_handler */
3593         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3594         rcu_assign_pointer(dev->rx_handler, rx_handler);
3595
3596         return 0;
3597 }
3598 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3599
3600 /**
3601  *      netdev_rx_handler_unregister - unregister receive handler
3602  *      @dev: device to unregister a handler from
3603  *
3604  *      Unregister a receive handler from a device.
3605  *
3606  *      The caller must hold the rtnl_mutex.
3607  */
3608 void netdev_rx_handler_unregister(struct net_device *dev)
3609 {
3610
3611         ASSERT_RTNL();
3612         RCU_INIT_POINTER(dev->rx_handler, NULL);
3613         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3614          * section has a guarantee to see a non NULL rx_handler_data
3615          * as well.
3616          */
3617         synchronize_net();
3618         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3619 }
3620 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3621
3622 /*
3623  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3624  * the special handling of PFMEMALLOC skbs.
3625  */
3626 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3627 {
3628         switch (skb->protocol) {
3629         case htons(ETH_P_ARP):
3630         case htons(ETH_P_IP):
3631         case htons(ETH_P_IPV6):
3632         case htons(ETH_P_8021Q):
3633         case htons(ETH_P_8021AD):
3634                 return true;
3635         default:
3636                 return false;
3637         }
3638 }
3639
/*
 * Core of the receive path for one skb.
 *
 * The skb is walked through, in order: the ptype_all handlers, the ingress
 * hook handle_ing() (CONFIG_NET_CLS_ACT only), VLAN reception, the device's
 * rx_handler and finally the protocol handlers hashed in ptype_base.
 *
 * @skb:        buffer to process
 * @pfmemalloc: when true, the ptype_all walk is skipped and a protocol that
 *              skb_pfmemalloc_protocol() rejects is dropped
 *
 * Delivery is deferred by one handler: a matching packet_type is remembered
 * in pt_prev and only passed to deliver_skb() once the next match (or a
 * later stage) is reached.  The last match is called directly through
 * pt_prev->func at the end.
 *
 * Returns NET_RX_DROP unless a delivery or rx_handler result overwrote it.
 * The list walks use the _rcu iterators; NOTE(review): callers are expected
 * to hold rcu_read_lock(), as netif_receive_skb_internal() does -- confirm
 * for other callers.
 */
3640 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3641 {
3642         struct packet_type *ptype, *pt_prev;
3643         rx_handler_func_t *rx_handler;
3644         struct net_device *orig_dev;
3645         struct net_device *null_or_dev;
3646         bool deliver_exact = false;
3647         int ret = NET_RX_DROP;
3648         __be16 type;
3649
3650         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3651
3652         trace_netif_receive_skb(skb);
3653
             /* skb->dev may be changed below; remember where we started. */
3654         orig_dev = skb->dev;
3655
3656         skb_reset_network_header(skb);
3657         if (!skb_transport_header_was_set(skb))
3658                 skb_reset_transport_header(skb);
3659         skb_reset_mac_len(skb);
3660
3661         pt_prev = NULL;
3662
             /* Re-entered after vlan_do_receive() succeeds or when the
              * rx_handler returns RX_HANDLER_ANOTHER.
              */
3663 another_round:
3664         skb->skb_iif = skb->dev->ifindex;
3665
3666         __this_cpu_inc(softnet_data.processed);
3667
3668         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3669             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3670                 skb = skb_vlan_untag(skb);
3671                 if (unlikely(!skb))
3672                         goto out;
3673         }
3674
3675 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and bypass both the ptype_all walk and
              * the ingress hook.
              */
3676         if (skb->tc_verd & TC_NCLS) {
3677                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3678                 goto ncls;
3679         }
3680 #endif
3681
3682         if (pfmemalloc)
3683                 goto skip_taps;
3684
             /* ptype_all handlers bound to no device or to skb->dev. */
3685         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3686                 if (!ptype->dev || ptype->dev == skb->dev) {
3687                         if (pt_prev)
3688                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3689                         pt_prev = ptype;
3690                 }
3691         }
3692
3693 skip_taps:
3694 #ifdef CONFIG_NET_CLS_ACT
             /* A NULL return means handle_ing() already freed the skb. */
3695         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3696         if (!skb)
3697                 goto out;
3698 ncls:
3699 #endif
3700
3701         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3702                 goto drop;
3703
3704         if (vlan_tx_tag_present(skb)) {
3705                 if (pt_prev) {
3706                         ret = deliver_skb(skb, pt_prev, orig_dev);
3707                         pt_prev = NULL;
3708                 }
3709                 if (vlan_do_receive(&skb))
3710                         goto another_round;
3711                 else if (unlikely(!skb))
3712                         goto out;
3713         }
3714
3715         rx_handler = rcu_dereference(skb->dev->rx_handler);
3716         if (rx_handler) {
3717                 if (pt_prev) {
3718                         ret = deliver_skb(skb, pt_prev, orig_dev);
3719                         pt_prev = NULL;
3720                 }
3721                 switch (rx_handler(&skb)) {
3722                 case RX_HANDLER_CONSUMED:
3723                         ret = NET_RX_SUCCESS;
3724                         goto out;
3725                 case RX_HANDLER_ANOTHER:
3726                         goto another_round;
3727                 case RX_HANDLER_EXACT:
3728                         deliver_exact = true;
                             /* fall through */
3729                 case RX_HANDLER_PASS:
3730                         break;
3731                 default:
3732                         BUG();
3733                 }
3734         }
3735
             /* Tag still present here, i.e. vlan_do_receive() did not take
              * the packet: a non-zero VLAN id marks it PACKET_OTHERHOST.
              */
3736         if (unlikely(vlan_tx_tag_present(skb))) {
3737                 if (vlan_tx_tag_get_id(skb))
3738                         skb->pkt_type = PACKET_OTHERHOST;
3739                 /* Note: we might in the future use prio bits
3740                  * and set skb->priority like in vlan_do_receive()
3741                  * For the time being, just ignore Priority Code Point
3742                  */
3743                 skb->vlan_tci = 0;
3744         }
3745
3746         /* deliver only exact match when indicated */
3747         null_or_dev = deliver_exact ? skb->dev : NULL;
3748
             /* Protocol handlers: type must match, and the handler's device
              * must be null_or_dev, the current device or the original one.
              */
3749         type = skb->protocol;
3750         list_for_each_entry_rcu(ptype,
3751                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3752                 if (ptype->type == type &&
3753                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3754                      ptype->dev == orig_dev)) {
3755                         if (pt_prev)
3756                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3757                         pt_prev = ptype;
3758                 }
3759         }
3760
             /* The last matching handler is called directly via ->func; with
              * no handler at all the skb is counted as dropped and freed.
              */
3761         if (pt_prev) {
3762                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3763                         goto drop;
3764                 else
3765                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3766         } else {
3767 drop:
3768                 atomic_long_inc(&skb->dev->rx_dropped);
3769                 kfree_skb(skb);
3770                 /* Jamal, now you will not able to escape explaining
3771                  * me how you were going to use this. :-)
3772                  */
3773                 ret = NET_RX_DROP;
3774         }
3775
3776 out:
3777         return ret;
3778 }
3779
3780 static int __netif_receive_skb(struct sk_buff *skb)
3781 {
3782         int ret;
3783
3784         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3785                 unsigned long pflags = current->flags;
3786
3787                 /*
3788                  * PFMEMALLOC skbs are special, they should
3789                  * - be delivered to SOCK_MEMALLOC sockets only
3790                  * - stay away from userspace
3791                  * - have bounded memory usage
3792                  *
3793                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3794                  * context down to all allocation sites.
3795                  */
3796                 current->flags |= PF_MEMALLOC;
3797                 ret = __netif_receive_skb_core(skb, true);
3798                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3799         } else
3800                 ret = __netif_receive_skb_core(skb, false);
3801
3802         return ret;
3803 }
3804
3805 static int netif_receive_skb_internal(struct sk_buff *skb)
3806 {
3807         int ret;
3808
3809         net_timestamp_check(netdev_tstamp_prequeue, skb);
3810
3811         if (skb_defer_rx_timestamp(skb))
3812                 return NET_RX_SUCCESS;
3813
3814         rcu_read_lock();
3815
3816 #ifdef CONFIG_RPS
3817         if (static_key_false(&rps_needed)) {
3818                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3819                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3820
3821                 if (cpu >= 0) {
3822                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3823                         rcu_read_unlock();
3824                         return ret;
3825                 }
3826         }
3827 #endif
3828         ret = __netif_receive_skb(skb);
3829         rcu_read_unlock();
3830         return ret;
3831 }
3832
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for processing received data.  The call itself
 *	always succeeds; the buffer may nevertheless be dropped while it is
 *	processed, for congestion control or by the protocol layers.
 *
 *	Must only be called from softirq context, and interrupts should be
 *	enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);
	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
3855
3856 /* Network device is going away, flush any packets still pending
3857  * Called with irqs disabled.
3858  */
3859 static void flush_backlog(void *arg)
3860 {
3861         struct net_device *dev = arg;
3862         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3863         struct sk_buff *skb, *tmp;
3864
3865         rps_lock(sd);
3866         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3867                 if (skb->dev == dev) {
3868                         __skb_unlink(skb, &sd->input_pkt_queue);
3869                         kfree_skb(skb);
3870                         input_queue_head_incr(sd);
3871                 }
3872         }
3873         rps_unlock(sd);
3874
3875         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3876                 if (skb->dev == dev) {
3877                         __skb_unlink(skb, &sd->process_queue);
3878                         kfree_skb(skb);
3879                         input_queue_head_incr(sd);
3880                 }
3881         }
3882 }
3883
3884 static int napi_gro_complete(struct sk_buff *skb)
3885 {
3886         struct packet_offload *ptype;
3887         __be16 type = skb->protocol;
3888         struct list_head *head = &offload_base;
3889         int err = -ENOENT;
3890
3891         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3892
3893         if (NAPI_GRO_CB(skb)->count == 1) {
3894                 skb_shinfo(skb)->gso_size = 0;
3895                 goto out;
3896         }
3897
3898         rcu_read_lock();
3899         list_for_each_entry_rcu(ptype, head, list) {
3900                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3901                         continue;
3902
3903                 err = ptype->callbacks.gro_complete(skb, 0);
3904                 break;
3905         }
3906         rcu_read_unlock();
3907
3908         if (err) {
3909                 WARN_ON(&ptype->list == head);
3910                 kfree_skb(skb);
3911                 return NET_RX_SUCCESS;
3912         }
3913
3914 out:
3915         return netif_receive_skb_internal(skb);
3916 }
3917
3918 /* napi->gro_list contains packets ordered by age.
3919  * youngest packets at the head of it.
3920  * Complete skbs in reverse order to reduce latencies.
3921  */
3922 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3923 {
3924         struct sk_buff *skb, *prev = NULL;
3925
3926         /* scan list and build reverse chain */
3927         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3928                 skb->prev = prev;
3929                 prev = skb;
3930         }
3931
3932         for (skb = prev; skb; skb = prev) {
3933                 skb->next = NULL;
3934
3935                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3936                         return;
3937
3938                 prev = skb->prev;
3939                 napi_gro_complete(skb);
3940                 napi->gro_count--;
3941         }
3942
3943         napi->gro_list = NULL;
3944 }
3945 EXPORT_SYMBOL(napi_gro_flush);
3946
3947 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3948 {
3949         struct sk_buff *p;
3950         unsigned int maclen = skb->dev->hard_header_len;
3951         u32 hash = skb_get_hash_raw(skb);
3952
3953         for (p = napi->gro_list; p; p = p->next) {
3954                 unsigned long diffs;
3955
3956                 NAPI_GRO_CB(p)->flush = 0;
3957
3958                 if (hash != skb_get_hash_raw(p)) {
3959                         NAPI_GRO_CB(p)->same_flow = 0;
3960                         continue;
3961                 }
3962
3963                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3964                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3965                 if (maclen == ETH_HLEN)
3966                         diffs |= compare_ether_header(skb_mac_header(p),
3967                                                       skb_mac_header(skb));
3968                 else if (!diffs)
3969                         diffs = memcmp(skb_mac_header(p),
3970                                        skb_mac_header(skb),
3971                                        maclen);
3972                 NAPI_GRO_CB(p)->same_flow = !diffs;
3973         }
3974 }
3975
3976 static void skb_gro_reset_offset(struct sk_buff *skb)
3977 {
3978         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3979         const skb_frag_t *frag0 = &pinfo->frags[0];
3980
3981         NAPI_GRO_CB(skb)->data_offset = 0;
3982         NAPI_GRO_CB(skb)->frag0 = NULL;
3983         NAPI_GRO_CB(skb)->frag0_len = 0;
3984
3985         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3986             pinfo->nr_frags &&
3987             !PageHighMem(skb_frag_page(frag0))) {
3988                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3989                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3990         }
3991 }
3992
3993 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3994 {
3995         struct skb_shared_info *pinfo = skb_shinfo(skb);
3996
3997         BUG_ON(skb->end - skb->tail < grow);
3998
3999         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4000
4001         skb->data_len -= grow;
4002         skb->tail += grow;
4003
4004         pinfo->frags[0].page_offset += grow;
4005         skb_frag_size_sub(&pinfo->frags[0], grow);
4006
4007         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4008                 skb_frag_unref(skb, 0);
4009                 memmove(pinfo->frags, pinfo->frags + 1,
4010                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4011         }
4012 }
4013
/*
 * Offer @skb to the GRO engine of @napi.
 *
 * Returns:
 *   GRO_NORMAL       - not aggregated: GRO is off on the device, the skb is
 *                      already GSO / has a frag list / has csum_bad set, no
 *                      offload with a gro_receive callback matches
 *                      skb->protocol, or the callback set ->flush without
 *                      ->same_flow
 *   GRO_MERGED       - the callback marked the skb ->same_flow
 *   GRO_MERGED_FREE  - as above, and NAPI_GRO_CB(skb)->free is set
 *   GRO_HELD         - the skb was put at the head of napi->gro_list as the
 *                      first packet of a new flow
 *
 * For GRO_NORMAL and GRO_HELD, header bytes that the GRO offset has moved
 * past but that are not yet in the linear area are copied there from frag0.
 */
4014 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4015 {
4016         struct sk_buff **pp = NULL;
4017         struct packet_offload *ptype;
4018         __be16 type = skb->protocol;
4019         struct list_head *head = &offload_base;
4020         int same_flow;
4021         enum gro_result ret;
4022         int grow;
4023
4024         if (!(skb->dev->features & NETIF_F_GRO))
4025                 goto normal;
4026
4027         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4028                 goto normal;
4029
             /* Mark which held packets may share a flow with this skb. */
4030         gro_list_prepare(napi, skb);
4031
4032         rcu_read_lock();
4033         list_for_each_entry_rcu(ptype, head, list) {
4034                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4035                         continue;
4036
                     /* First matching offload: reset the per-skb GRO state,
                      * run its callback once and stop searching.
                      */
4037                 skb_set_network_header(skb, skb_gro_offset(skb));
4038                 skb_reset_mac_len(skb);
4039                 NAPI_GRO_CB(skb)->same_flow = 0;
4040                 NAPI_GRO_CB(skb)->flush = 0;
4041                 NAPI_GRO_CB(skb)->free = 0;
4042                 NAPI_GRO_CB(skb)->encap_mark = 0;
4043
4044                 /* Setup for GRO checksum validation */
4045                 switch (skb->ip_summed) {
4046                 case CHECKSUM_COMPLETE:
4047                         NAPI_GRO_CB(skb)->csum = skb->csum;
4048                         NAPI_GRO_CB(skb)->csum_valid = 1;
4049                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4050                         break;
4051                 case CHECKSUM_UNNECESSARY:
4052                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4053                         NAPI_GRO_CB(skb)->csum_valid = 0;
4054                         break;
4055                 default:
4056                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4057                         NAPI_GRO_CB(skb)->csum_valid = 0;
4058                 }
4059
4060                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4061                 break;
4062         }
4063         rcu_read_unlock();
4064
             /* The iterator ran off the end of the list: nothing matched. */
4065         if (&ptype->list == head)
4066                 goto normal;
4067
4068         same_flow = NAPI_GRO_CB(skb)->same_flow;
4069         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4070
             /* The callback handed back a list slot: unlink the held packet
              * it refers to and complete it now.
              */
4071         if (pp) {
4072                 struct sk_buff *nskb = *pp;
4073
4074                 *pp = nskb->next;
4075                 nskb->next = NULL;
4076                 napi_gro_complete(nskb);
4077                 napi->gro_count--;
4078         }
4079
4080         if (same_flow)
4081                 goto ok;
4082
4083         if (NAPI_GRO_CB(skb)->flush)
4084                 goto normal;
4085
             /* New flow.  When the list is full, complete the oldest entry
              * (the tail) to make room; gro_count then stays unchanged.
              */
4086         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4087                 struct sk_buff *nskb = napi->gro_list;
4088
4089                 /* locate the end of the list to select the 'oldest' flow */
4090                 while (nskb->next) {
4091                         pp = &nskb->next;
4092                         nskb = *pp;
4093                 }
4094                 *pp = NULL;
4095                 nskb->next = NULL;
4096                 napi_gro_complete(nskb);
4097         } else {
4098                 napi->gro_count++;
4099         }
             /* Start a new flow with this skb at the head of the list. */
4100         NAPI_GRO_CB(skb)->count = 1;
4101         NAPI_GRO_CB(skb)->age = jiffies;
4102         NAPI_GRO_CB(skb)->last = skb;
4103         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4104         skb->next = napi->gro_list;
4105         napi->gro_list = skb;
4106         ret = GRO_HELD;
4107
4108 pull:
             /* Copy into the head whatever the GRO offset has moved past
              * but that still lives only in frag0.
              */
4109         grow = skb_gro_offset(skb) - skb_headlen(skb);
4110         if (grow > 0)
4111                 gro_pull_from_frag0(skb, grow);
4112 ok:
4113         return ret;
4114
4115 normal:
4116         ret = GRO_NORMAL;
4117         goto pull;
4118 }
4119
4120 struct packet_offload *gro_find_receive_by_type(__be16 type)
4121 {
4122         struct list_head *offload_head = &offload_base;
4123         struct packet_offload *ptype;
4124
4125         list_for_each_entry_rcu(ptype, offload_head, list) {
4126                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4127                         continue;
4128                 return ptype;
4129         }
4130         return NULL;
4131 }
4132 EXPORT_SYMBOL(gro_find_receive_by_type);
4133
4134 struct packet_offload *gro_find_complete_by_type(__be16 type)
4135 {
4136         struct list_head *offload_head = &offload_base;
4137         struct packet_offload *ptype;
4138
4139         list_for_each_entry_rcu(ptype, offload_head, list) {
4140                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4141                         continue;
4142                 return ptype;
4143         }
4144         return NULL;
4145 }
4146 EXPORT_SYMBOL(gro_find_complete_by_type);
4147
4148 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4149 {
4150         switch (ret) {
4151         case GRO_NORMAL:
4152                 if (netif_receive_skb_internal(skb))
4153                         ret = GRO_DROP;
4154                 break;
4155
4156         case GRO_DROP:
4157                 kfree_skb(skb);
4158                 break;
4159
4160         case GRO_MERGED_FREE:
4161                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4162                         kmem_cache_free(skbuff_head_cache, skb);
4163                 else
4164                         __kfree_skb(skb);
4165                 break;
4166
4167         case GRO_HELD:
4168         case GRO_MERGED:
4169                 break;
4170         }
4171
4172         return ret;
4173 }
4174
4175 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4176 {
4177         trace_napi_gro_receive_entry(skb);
4178
4179         skb_gro_reset_offset(skb);
4180
4181         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4182 }
4183 EXPORT_SYMBOL(napi_gro_receive);
4184
4185 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4186 {
4187         if (unlikely(skb->pfmemalloc)) {
4188                 consume_skb(skb);
4189                 return;
4190         }
4191         __skb_pull(skb, skb_headlen(skb));
4192         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4193         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4194         skb->vlan_tci = 0;
4195         skb->dev = napi->dev;
4196         skb->skb_iif = 0;
4197         skb->encapsulation = 0;
4198         skb_shinfo(skb)->gso_type = 0;
4199         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4200
4201         napi->skb = skb;
4202 }
4203
4204 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4205 {
4206         struct sk_buff *skb = napi->skb;
4207
4208         if (!skb) {
4209                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4210                 napi->skb = skb;
4211         }
4212         return skb;
4213 }
4214 EXPORT_SYMBOL(napi_get_frags);
4215
4216 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4217                                       struct sk_buff *skb,
4218                                       gro_result_t ret)
4219 {
4220         switch (ret) {
4221         case GRO_NORMAL:
4222         case GRO_HELD:
4223                 __skb_push(skb, ETH_HLEN);
4224                 skb->protocol = eth_type_trans(skb, skb->dev);
4225                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4226                         ret = GRO_DROP;
4227                 break;
4228
4229         case GRO_DROP:
4230         case GRO_MERGED_FREE:
4231                 napi_reuse_skb(napi, skb);
4232                 break;
4233
4234         case GRO_MERGED:
4235                 break;
4236         }
4237
4238         return ret;
4239 }
4240
4241 /* Upper GRO stack assumes network header starts at gro_offset=0
4242  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4243  * We copy ethernet header into skb->data to have a common layout.
4244  */
4245 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4246 {
4247         struct sk_buff *skb = napi->skb;
4248         const struct ethhdr *eth;
4249         unsigned int hlen = sizeof(*eth);
4250
4251         napi->skb = NULL;
4252
4253         skb_reset_mac_header(skb);
4254         skb_gro_reset_offset(skb);
4255
4256         eth = skb_gro_header_fast(skb, 0);
4257         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4258                 eth = skb_gro_header_slow(skb, hlen, 0);
4259                 if (unlikely(!eth)) {
4260                         napi_reuse_skb(napi, skb);
4261                         return NULL;
4262                 }
4263         } else {
4264                 gro_pull_from_frag0(skb, hlen);
4265                 NAPI_GRO_CB(skb)->frag0 += hlen;
4266                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4267         }
4268         __skb_pull(skb, hlen);
4269
4270         /*
4271          * This works because the only protocols we care about don't require
4272          * special handling.
4273          * We'll fix it up properly in napi_frags_finish()
4274          */
4275         skb->protocol = eth->h_proto;
4276
4277         return skb;
4278 }
4279
4280 gro_result_t napi_gro_frags(struct napi_struct *napi)
4281 {
4282         struct sk_buff *skb = napi_frags_skb(napi);
4283
4284         if (!skb)
4285                 return GRO_DROP;
4286
4287         trace_napi_gro_frags_entry(skb);
4288
4289         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4290 }
4291 EXPORT_SYMBOL(napi_gro_frags);
4292
4293 /* Compute the checksum from gro_offset and return the folded value
4294  * after adding in any pseudo checksum.
4295  */
4296 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4297 {
4298         __wsum wsum;
4299         __sum16 sum;
4300
4301         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4302
4303         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4304         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4305         if (likely(!sum)) {
4306                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4307                     !skb->csum_complete_sw)
4308                         netdev_rx_csum_fault(skb->dev);
4309         }
4310
4311         NAPI_GRO_CB(skb)->csum = wsum;
4312         NAPI_GRO_CB(skb)->csum_valid = 1;
4313
4314         return sum;
4315 }
4316 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4317
/*
 * net_rps_action_and_irq_enable - send the pending RPS IPIs and enable irqs.
 * Note: must be entered with local irqs disabled; always returns with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	/* Detach the list while irqs are still off. */
	if (pending)
		sd->rps_ipi_list = NULL;
#endif

	local_irq_enable();

#ifdef CONFIG_RPS
	/* Kick RPS processing on every remote cpu that is still online. */
	while (pending) {
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	}
#endif
}
4345
4346 static int process_backlog(struct napi_struct *napi, int quota)
4347 {
4348         int work = 0;
4349         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4350
4351 #ifdef CONFIG_RPS
4352         /* Check if we have pending ipi, its better to send them now,
4353          * not waiting net_rx_action() end.
4354          */
4355         if (sd->rps_ipi_list) {
4356                 local_irq_disable();
4357                 net_rps_action_and_irq_enable(sd);
4358         }
4359 #endif
4360         napi->weight = weight_p;
4361         local_irq_disable();
4362         while (1) {
4363                 struct sk_buff *skb;
4364
4365                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4366                         rcu_read_lock();
4367                         local_irq_enable();
4368                         __netif_receive_skb(skb);
4369                         rcu_read_unlock();
4370                         local_irq_disable();
4371                         input_queue_head_incr(sd);
4372                         if (++work >= quota) {
4373                                 local_irq_enable();
4374                                 return work;
4375                         }
4376                 }
4377
4378                 rps_lock(sd);
4379                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4380                         /*
4381                          * Inline a custom version of __napi_complete().
4382                          * only current cpu owns and manipulates this napi,
4383                          * and NAPI_STATE_SCHED is the only possible flag set
4384                          * on backlog.
4385                          * We can use a plain write instead of clear_bit(),
4386                          * and we dont need an smp_mb() memory barrier.
4387                          */
4388                         list_del(&napi->poll_list);
4389                         napi->state = 0;
4390                         rps_unlock(sd);
4391
4392                         break;
4393                 }
4394
4395                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4396                                            &sd->process_queue);
4397                 rps_unlock(sd);
4398         }
4399         local_irq_enable();
4400
4401         return work;
4402 }
4403
4404 /**
4405  * __napi_schedule - schedule for receive
4406  * @n: entry to schedule
4407  *
4408  * The entry's receive function will be scheduled to run
4409  */
4410 void __napi_schedule(struct napi_struct *n)
4411 {
4412         unsigned long flags;
4413
4414         local_irq_save(flags);
4415         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4416         local_irq_restore(flags);
4417 }
4418 EXPORT_SYMBOL(__napi_schedule);
4419
4420 void __napi_complete(struct napi_struct *n)
4421 {
4422         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4423         BUG_ON(n->gro_list);
4424
4425         list_del(&n->poll_list);
4426         smp_mb__before_atomic();
4427         clear_bit(NAPI_STATE_SCHED, &n->state);
4428 }
4429 EXPORT_SYMBOL(__napi_complete);
4430
4431 void napi_complete(struct napi_struct *n)
4432 {
4433         unsigned long flags;
4434
4435         /*
4436          * don't let napi dequeue from the cpu poll list
4437          * just in case its running on a different cpu
4438          */
4439         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4440                 return;
4441
4442         napi_gro_flush(n, false);
4443         local_irq_save(flags);
4444         __napi_complete(n);
4445         local_irq_restore(flags);
4446 }
4447 EXPORT_SYMBOL(napi_complete);
4448
4449 /* must be called under rcu_read_lock(), as we dont take a reference */
4450 struct napi_struct *napi_by_id(unsigned int napi_id)
4451 {
4452         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4453         struct napi_struct *napi;
4454
4455         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4456                 if (napi->napi_id == napi_id)
4457                         return napi;
4458
4459         return NULL;
4460 }
4461 EXPORT_SYMBOL_GPL(napi_by_id);
4462
4463 void napi_hash_add(struct napi_struct *napi)
4464 {
4465         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4466
4467                 spin_lock(&napi_hash_lock);
4468
4469                 /* 0 is not a valid id, we also skip an id that is taken
4470                  * we expect both events to be extremely rare
4471                  */
4472                 napi->napi_id = 0;
4473                 while (!napi->napi_id) {
4474                         napi->napi_id = ++napi_gen_id;
4475                         if (napi_by_id(napi->napi_id))
4476                                 napi->napi_id = 0;
4477                 }
4478
4479                 hlist_add_head_rcu(&napi->napi_hash_node,
4480                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4481
4482                 spin_unlock(&napi_hash_lock);
4483         }
4484 }
4485 EXPORT_SYMBOL_GPL(napi_hash_add);
4486
4487 /* Warning : caller is responsible to make sure rcu grace period
4488  * is respected before freeing memory containing @napi
4489  */
4490 void napi_hash_del(struct napi_struct *napi)
4491 {
4492         spin_lock(&napi_hash_lock);
4493
4494         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4495                 hlist_del_rcu(&napi->napi_hash_node);
4496
4497         spin_unlock(&napi_hash_lock);
4498 }
4499 EXPORT_SYMBOL_GPL(napi_hash_del);
4500
4501 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4502                     int (*poll)(struct napi_struct *, int), int weight)
4503 {
4504         INIT_LIST_HEAD(&napi->poll_list);
4505         napi->gro_count = 0;
4506         napi->gro_list = NULL;
4507         napi->skb = NULL;
4508         napi->poll = poll;
4509         if (weight > NAPI_POLL_WEIGHT)
4510                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4511                             weight, dev->name);
4512         napi->weight = weight;
4513         list_add(&napi->dev_list, &dev->napi_list);
4514         napi->dev = dev;
4515 #ifdef CONFIG_NETPOLL
4516         spin_lock_init(&napi->poll_lock);
4517         napi->poll_owner = -1;
4518 #endif
4519         set_bit(NAPI_STATE_SCHED, &napi->state);
4520 }
4521 EXPORT_SYMBOL(netif_napi_add);
4522
4523 void netif_napi_del(struct napi_struct *napi)
4524 {
4525         list_del_init(&napi->dev_list);
4526         napi_free_frags(napi);
4527
4528         kfree_skb_list(napi->gro_list);
4529         napi->gro_list = NULL;
4530         napi->gro_count = 0;
4531 }
4532 EXPORT_SYMBOL(netif_napi_del);
4533
4534 static void net_rx_action(struct softirq_action *h)
4535 {
4536         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4537         unsigned long time_limit = jiffies + 2;
4538         int budget = netdev_budget;
4539         void *have;
4540
4541         local_irq_disable();
4542
4543         while (!list_empty(&sd->poll_list)) {
4544                 struct napi_struct *n;
4545                 int work, weight;
4546
4547                 /* If softirq window is exhuasted then punt.
4548                  * Allow this to run for 2 jiffies since which will allow
4549                  * an average latency of 1.5/HZ.
4550                  */
4551                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4552                         goto softnet_break;
4553
4554                 local_irq_enable();
4555
4556                 /* Even though interrupts have been re-enabled, this
4557                  * access is safe because interrupts can only add new
4558                  * entries to the tail of this list, and only ->poll()
4559                  * calls can remove this head entry from the list.
4560                  */
4561                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4562
4563                 have = netpoll_poll_lock(n);
4564
4565                 weight = n->weight;
4566
4567                 /* This NAPI_STATE_SCHED test is for avoiding a race
4568                  * with netpoll's poll_napi().  Only the entity which
4569                  * obtains the lock and sees NAPI_STATE_SCHED set will
4570                  * actually make the ->poll() call.  Therefore we avoid
4571                  * accidentally calling ->poll() when NAPI is not scheduled.
4572                  */
4573                 work = 0;
4574                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4575                         work = n->poll(n, weight);
4576                         trace_napi_poll(n);
4577                 }
4578
4579                 WARN_ON_ONCE(work > weight);
4580
4581                 budget -= work;
4582
4583                 local_irq_disable();
4584
4585                 /* Drivers must not modify the NAPI state if they
4586                  * consume the entire weight.  In such cases this code
4587                  * still "owns" the NAPI instance and therefore can
4588                  * move the instance around on the list at-will.
4589                  */
4590                 if (unlikely(work == weight)) {
4591                         if (unlikely(napi_disable_pending(n))) {
4592                                 local_irq_enable();
4593                                 napi_complete(n);
4594                                 local_irq_disable();
4595                         } else {
4596                                 if (n->gro_list) {
4597                                         /* flush too old packets
4598                                          * If HZ < 1000, flush all packets.
4599                                          */
4600                                         local_irq_enable();
4601                                         napi_gro_flush(n, HZ >= 1000);
4602                                         local_irq_disable();
4603                                 }
4604                                 list_move_tail(&n->poll_list, &sd->poll_list);
4605                         }
4606                 }
4607
4608                 netpoll_poll_unlock(have);
4609         }
4610 out:
4611         net_rps_action_and_irq_enable(sd);
4612
4613         return;
4614
4615 softnet_break:
4616         sd->time_squeeze++;
4617         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4618         goto out;
4619 }
4620
4621 struct netdev_adjacent {
4622         struct net_device *dev;
4623
4624         /* upper master flag, there can only be one master device per list */
4625         bool master;
4626
4627         /* counter for the number of times this device was added to us */
4628         u16 ref_nr;
4629
4630         /* private field for the users */
4631         void *private;
4632
4633         struct list_head list;
4634         struct rcu_head rcu;
4635 };
4636
4637 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4638                                                  struct net_device *adj_dev,
4639                                                  struct list_head *adj_list)
4640 {
4641         struct netdev_adjacent *adj;
4642
4643         list_for_each_entry(adj, adj_list, list) {
4644                 if (adj->dev == adj_dev)
4645                         return adj;
4646         }
4647         return NULL;
4648 }
4649
4650 /**
4651  * netdev_has_upper_dev - Check if device is linked to an upper device
4652  * @dev: device
4653  * @upper_dev: upper device to check
4654  *
4655  * Find out if a device is linked to specified upper device and return true
4656  * in case it is. Note that this checks only immediate upper device,
4657  * not through a complete stack of devices. The caller must hold the RTNL lock.
4658  */
4659 bool netdev_has_upper_dev(struct net_device *dev,
4660                           struct net_device *upper_dev)
4661 {
4662         ASSERT_RTNL();
4663
4664         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4665 }
4666 EXPORT_SYMBOL(netdev_has_upper_dev);
4667
4668 /**
4669  * netdev_has_any_upper_dev - Check if device is linked to some device
4670  * @dev: device
4671  *
4672  * Find out if a device is linked to an upper device and return true in case
4673  * it is. The caller must hold the RTNL lock.
4674  */
4675 static bool netdev_has_any_upper_dev(struct net_device *dev)
4676 {
4677         ASSERT_RTNL();
4678
4679         return !list_empty(&dev->all_adj_list.upper);
4680 }
4681
4682 /**
4683  * netdev_master_upper_dev_get - Get master upper device
4684  * @dev: device
4685  *
4686  * Find a master upper device and return pointer to it or NULL in case
4687  * it's not there. The caller must hold the RTNL lock.
4688  */
4689 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4690 {
4691         struct netdev_adjacent *upper;
4692
4693         ASSERT_RTNL();
4694
4695         if (list_empty(&dev->adj_list.upper))
4696                 return NULL;
4697
4698         upper = list_first_entry(&dev->adj_list.upper,
4699                                  struct netdev_adjacent, list);
4700         if (likely(upper->master))
4701                 return upper->dev;
4702         return NULL;
4703 }
4704 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4705
4706 void *netdev_adjacent_get_private(struct list_head *adj_list)
4707 {
4708         struct netdev_adjacent *adj;
4709
4710         adj = list_entry(adj_list, struct netdev_adjacent, list);
4711
4712         return adj->private;
4713 }
4714 EXPORT_SYMBOL(netdev_adjacent_get_private);
4715
4716 /**
4717  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4718  * @dev: device
4719  * @iter: list_head ** of the current position
4720  *
4721  * Gets the next device from the dev's upper list, starting from iter
4722  * position. The caller must hold RCU read lock.
4723  */
4724 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4725                                                  struct list_head **iter)
4726 {
4727         struct netdev_adjacent *upper;
4728
4729         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4730
4731         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4732
4733         if (&upper->list == &dev->adj_list.upper)
4734                 return NULL;
4735
4736         *iter = &upper->list;
4737
4738         return upper->dev;
4739 }
4740 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4741
4742 /**
4743  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4744  * @dev: device
4745  * @iter: list_head ** of the current position
4746  *
4747  * Gets the next device from the dev's upper list, starting from iter
4748  * position. The caller must hold RCU read lock.
4749  */
4750 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4751                                                      struct list_head **iter)
4752 {
4753         struct netdev_adjacent *upper;
4754
4755         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4756
4757         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4758
4759         if (&upper->list == &dev->all_adj_list.upper)
4760                 return NULL;
4761
4762         *iter = &upper->list;
4763
4764         return upper->dev;
4765 }
4766 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4767
4768 /**
4769  * netdev_lower_get_next_private - Get the next ->private from the
4770  *                                 lower neighbour list
4771  * @dev: device
4772  * @iter: list_head ** of the current position
4773  *
4774  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4775  * list, starting from iter position. The caller must hold either hold the
4776  * RTNL lock or its own locking that guarantees that the neighbour lower
4777  * list will remain unchainged.
4778  */
4779 void *netdev_lower_get_next_private(struct net_device *dev,
4780                                     struct list_head **iter)
4781 {
4782         struct netdev_adjacent *lower;
4783
4784         lower = list_entry(*iter, struct netdev_adjacent, list);
4785
4786         if (&lower->list == &dev->adj_list.lower)
4787                 return NULL;
4788
4789         *iter = lower->list.next;
4790
4791         return lower->private;
4792 }
4793 EXPORT_SYMBOL(netdev_lower_get_next_private);
4794
4795 /**
4796  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4797  *                                     lower neighbour list, RCU
4798  *                                     variant
4799  * @dev: device
4800  * @iter: list_head ** of the current position
4801  *
4802  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4803  * list, starting from iter position. The caller must hold RCU read lock.
4804  */
4805 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4806                                         struct list_head **iter)
4807 {
4808         struct netdev_adjacent *lower;
4809
4810         WARN_ON_ONCE(!rcu_read_lock_held());
4811
4812         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4813
4814         if (&lower->list == &dev->adj_list.lower)
4815                 return NULL;
4816
4817         *iter = &lower->list;
4818
4819         return lower->private;
4820 }
4821 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4822
4823 /**
4824  * netdev_lower_get_next - Get the next device from the lower neighbour
4825  *                         list
4826  * @dev: device
4827  * @iter: list_head ** of the current position
4828  *
4829  * Gets the next netdev_adjacent from the dev's lower neighbour
4830  * list, starting from iter position. The caller must hold RTNL lock or
4831  * its own locking that guarantees that the neighbour lower
4832  * list will remain unchainged.
4833  */
4834 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4835 {
4836         struct netdev_adjacent *lower;
4837
4838         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4839
4840         if (&lower->list == &dev->adj_list.lower)
4841                 return NULL;
4842
4843         *iter = &lower->list;
4844
4845         return lower->dev;
4846 }
4847 EXPORT_SYMBOL(netdev_lower_get_next);
4848
4849 /**
4850  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4851  *                                     lower neighbour list, RCU
4852  *                                     variant
4853  * @dev: device
4854  *
4855  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4856  * list. The caller must hold RCU read lock.
4857  */
4858 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4859 {
4860         struct netdev_adjacent *lower;
4861
4862         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4863                         struct netdev_adjacent, list);
4864         if (lower)
4865                 return lower->private;
4866         return NULL;
4867 }
4868 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4869
4870 /**
4871  * netdev_master_upper_dev_get_rcu - Get master upper device
4872  * @dev: device
4873  *
4874  * Find a master upper device and return pointer to it or NULL in case
4875  * it's not there. The caller must hold the RCU read lock.
4876  */
4877 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4878 {
4879         struct netdev_adjacent *upper;
4880
4881         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4882                                        struct netdev_adjacent, list);
4883         if (upper && likely(upper->master))
4884                 return upper->dev;
4885         return NULL;
4886 }
4887 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4888
4889 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4890                               struct net_device *adj_dev,
4891                               struct list_head *dev_list)
4892 {
4893         char linkname[IFNAMSIZ+7];
4894         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4895                 "upper_%s" : "lower_%s", adj_dev->name);
4896         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4897                                  linkname);
4898 }
4899 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4900                                char *name,
4901                                struct list_head *dev_list)
4902 {
4903         char linkname[IFNAMSIZ+7];
4904         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4905                 "upper_%s" : "lower_%s", name);
4906         sysfs_remove_link(&(dev->dev.kobj), linkname);
4907 }
4908
4909 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4910                                                  struct net_device *adj_dev,
4911                                                  struct list_head *dev_list)
4912 {
4913         return (dev_list == &dev->adj_list.upper ||
4914                 dev_list == &dev->adj_list.lower) &&
4915                 net_eq(dev_net(dev), dev_net(adj_dev));
4916 }
4917
4918 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4919                                         struct net_device *adj_dev,
4920                                         u16 ref_nr,
4921                                         struct list_head *dev_list,
4922                                         void *private, bool master)
4923 {
4924         struct netdev_adjacent *adj;
4925         int ret;
4926
4927         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4928
4929         if (adj) {
4930                 adj->ref_nr += ref_nr;
4931                 return 0;
4932         }
4933
4934         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4935         if (!adj)
4936                 return -ENOMEM;
4937
4938         adj->dev = adj_dev;
4939         adj->master = master;
4940         adj->ref_nr = ref_nr;
4941         adj->private = private;
4942         dev_hold(adj_dev);
4943
4944         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4945                  adj_dev->name, dev->name, adj_dev->name);
4946
4947         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4948                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4949                 if (ret)
4950                         goto free_adj;
4951         }
4952
4953         /* Ensure that master link is always the first item in list. */
4954         if (master) {
4955                 ret = sysfs_create_link(&(dev->dev.kobj),
4956                                         &(adj_dev->dev.kobj), "master");
4957                 if (ret)
4958                         goto remove_symlinks;
4959
4960                 list_add_rcu(&adj->list, dev_list);
4961         } else {
4962                 list_add_tail_rcu(&adj->list, dev_list);
4963         }
4964
4965         return 0;
4966
4967 remove_symlinks:
4968         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4969                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4970 free_adj:
4971         kfree(adj);
4972         dev_put(adj_dev);
4973
4974         return ret;
4975 }
4976
4977 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4978                                          struct net_device *adj_dev,
4979                                          u16 ref_nr,
4980                                          struct list_head *dev_list)
4981 {
4982         struct netdev_adjacent *adj;
4983
4984         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4985
4986         if (!adj) {
4987                 pr_err("tried to remove device %s from %s\n",
4988                        dev->name, adj_dev->name);
4989                 BUG();
4990         }
4991
4992         if (adj->ref_nr > ref_nr) {
4993                 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
4994                          ref_nr, adj->ref_nr-ref_nr);
4995                 adj->ref_nr -= ref_nr;
4996                 return;
4997         }
4998
4999         if (adj->master)
5000                 sysfs_remove_link(&(dev->dev.kobj), "master");
5001
5002         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5003                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5004
5005         list_del_rcu(&adj->list);
5006         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5007                  adj_dev->name, dev->name, adj_dev->name);
5008         dev_put(adj_dev);
5009         kfree_rcu(adj, rcu);
5010 }
5011
5012 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5013                                             struct net_device *upper_dev,
5014                                             u16 ref_nr,
5015                                             struct list_head *up_list,
5016                                             struct list_head *down_list,
5017                                             void *private, bool master)
5018 {
5019         int ret;
5020
5021         ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5022                                            private, master);
5023         if (ret)
5024                 return ret;
5025
5026         ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5027                                            private, false);
5028         if (ret) {
5029                 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5030                 return ret;
5031         }
5032
5033         return 0;
5034 }
5035
5036 static int __netdev_adjacent_dev_link(struct net_device *dev,
5037                                       struct net_device *upper_dev,
5038                                       u16 ref_nr)
5039 {
5040         return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5041                                                 &dev->all_adj_list.upper,
5042                                                 &upper_dev->all_adj_list.lower,
5043                                                 NULL, false);
5044 }
5045
5046 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5047                                                struct net_device *upper_dev,
5048                                                u16 ref_nr,
5049                                                struct list_head *up_list,
5050                                                struct list_head *down_list)
5051 {
5052         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5053         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5054 }
5055
5056 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5057                                          struct net_device *upper_dev,
5058                                          u16 ref_nr)
5059 {
5060         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5061                                            &dev->all_adj_list.upper,
5062                                            &upper_dev->all_adj_list.lower);
5063 }
5064
5065 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5066                                                 struct net_device *upper_dev,
5067                                                 void *private, bool master)
5068 {
5069         int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5070
5071         if (ret)
5072                 return ret;
5073
5074         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5075                                                &dev->adj_list.upper,
5076                                                &upper_dev->adj_list.lower,
5077                                                private, master);
5078         if (ret) {
5079                 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5080                 return ret;
5081         }
5082
5083         return 0;
5084 }
5085
5086 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5087                                                    struct net_device *upper_dev)
5088 {
5089         __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5090         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5091                                            &dev->adj_list.upper,
5092                                            &upper_dev->adj_list.lower);
5093 }
5094
     /* Link @dev below @upper_dev and propagate the new relationship to
      * every device already above @upper_dev and below @dev.
      *
      * @master selects a master link (a device may have only one);
      * @private is handed to __netdev_adjacent_dev_link_neighbour().
      * Must be called with RTNL held.
      *
      * Returns 0 on success, -EBUSY if @dev == @upper_dev, if @dev is
      * already found in upper_dev's all_adj_list.upper (which would create
      * a loop) or if a master is requested while @dev already has one,
      * -EEXIST if @upper_dev is already in dev's adj_list.upper, or the
      * error returned by one of the link helpers.  On failure the links
      * created so far are removed again before returning.
      */
5095 static int __netdev_upper_dev_link(struct net_device *dev,
5096                                    struct net_device *upper_dev, bool master,
5097                                    void *private)
5098 {
5099         struct netdev_adjacent *i, *j, *to_i, *to_j;
5100         int ret = 0;
5101
5102         ASSERT_RTNL();
5103
5104         if (dev == upper_dev)
5105                 return -EBUSY;
5106
5107         /* To prevent loops, check if dev is not upper device to upper_dev. */
5108         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5109                 return -EBUSY;
5110
5111         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5112                 return -EEXIST;
5113
5114         if (master && netdev_master_upper_dev_get(dev))
5115                 return -EBUSY;
5116
5117         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5118                                                    master);
5119         if (ret)
5120                 return ret;
5121
5122         /* Now that we linked these devs, make all the upper_dev's
5123          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5124          * vice versa, and don't forget the devices themselves. All of these
5125          * links are non-neighbours.
5126          */
5127         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5128                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5129                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5130                                  i->dev->name, j->dev->name);
5131                         ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5132                         if (ret)
5133                                 goto rollback_mesh;
5134                 }
5135         }
5136
5137         /* add dev to every upper_dev's upper device */
5138         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5139                 pr_debug("linking %s's upper device %s with %s\n",
5140                          upper_dev->name, i->dev->name, dev->name);
5141                 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5142                 if (ret)
5143                         goto rollback_upper_mesh;
5144         }
5145
5146         /* add upper_dev to every dev's lower device */
5147         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5148                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5149                          i->dev->name, upper_dev->name);
5150                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5151                 if (ret)
5152                         goto rollback_lower_mesh;
5153         }
5154
5155         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5156         return 0;
5157
             /* The rollback stages below run in reverse order of the linking
              * stages above and fall through into each other.  Each stage
              * walks its list from the start and stops at the entry (to_i,
              * to_j) whose link failed, so only links that were really made
              * are removed.
              */
5158 rollback_lower_mesh:
             /* i is the lower device whose link to upper_dev failed. */
5159         to_i = i;
5160         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5161                 if (i == to_i)
5162                         break;
5163                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5164         }
5165
             /* The previous stage completed fully: clear the sentinel so that
              * the next loop never matches and unlinks its whole list.
              */
5166         i = NULL;
5167
5168 rollback_upper_mesh:
5169         to_i = i;
5170         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5171                 if (i == to_i)
5172                         break;
5173                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5174         }
5175
             /* Same as above, now for the (i, j) pair of the mesh stage. */
5176         i = j = NULL;
5177
5178 rollback_mesh:
5179         to_i = i;
5180         to_j = j;
5181         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5182                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
                             /* Stop at the pair whose link failed. */
5183                         if (i == to_i && j == to_j)
5184                                 break;
5185                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5186                 }
5187                 if (i == to_i)
5188                         break;
5189         }
5190
             /* Finally drop the neighbour link created at the very start. */
5191         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5192
5193         return ret;
5194 }
5195
5196 /**
5197  * netdev_upper_dev_link - Add a link to the upper device
5198  * @dev: device
5199  * @upper_dev: new upper device
5200  *
5201  * Adds a link to device which is upper to this one. The caller must hold
5202  * the RTNL lock. On a failure a negative errno code is returned.
5203  * On success the reference counts are adjusted and the function
5204  * returns zero.
      *
      * The link is created as a non-master link without private data.
5205  */
5206 int netdev_upper_dev_link(struct net_device *dev,
5207                           struct net_device *upper_dev)
5208 {
5209         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5210 }
5211 EXPORT_SYMBOL(netdev_upper_dev_link);
5212
5213 /**
5214  * netdev_master_upper_dev_link - Add a master link to the upper device
5215  * @dev: device
5216  * @upper_dev: new upper device
5217  *
5218  * Adds a link to device which is upper to this one. In this case, only
5219  * one master upper device can be linked, although other non-master devices
5220  * might be linked as well. The caller must hold the RTNL lock.
5221  * On a failure a negative errno code is returned. On success the reference
5222  * counts are adjusted and the function returns zero.
      *
      * The link is created as a master link without private data.
5223  */
5224 int netdev_master_upper_dev_link(struct net_device *dev,
5225                                  struct net_device *upper_dev)
5226 {
5227         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5228 }
5229 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5230
     /**
      * netdev_master_upper_dev_link_private - Add a master link with private data
      * @dev: device
      * @upper_dev: new upper device
      * @private: pointer passed on to __netdev_upper_dev_link() for the new link
      *
      * Same as netdev_master_upper_dev_link(), except that @private is passed
      * instead of NULL. The caller must hold the RTNL lock. Returns zero on
      * success or a negative errno code on failure.
      *
      * NOTE(review): @private is presumably the value later returned by
      * netdev_lower_dev_get_private() - confirm against the link helper.
      */
5231 int netdev_master_upper_dev_link_private(struct net_device *dev,
5232                                          struct net_device *upper_dev,
5233                                          void *private)
5234 {
5235         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5236 }
5237 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5238
5239 /**
5240  * netdev_upper_dev_unlink - Removes a link to upper device
5241  * @dev: device
5242  * @upper_dev: upper device to unlink
5243  *
5244  * Removes a link to device which is upper to this one. The caller must hold
5245  * the RTNL lock.
      *
      * After the links have been removed the NETDEV_CHANGEUPPER notifier chain
      * is called for @dev.
5246  */
5247 void netdev_upper_dev_unlink(struct net_device *dev,
5248                              struct net_device *upper_dev)
5249 {
5250         struct netdev_adjacent *i, *j;
5251         ASSERT_RTNL();
5252
             /* Drop the direct neighbour link first. */
5253         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5254
5255         /* Here is the tricky part. We must remove all dev's lower
5256          * devices from all upper_dev's upper devices and vice
5257          * versa, to maintain the graph relationship.
5258          */
5259         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5260                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5261                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5262
5263         /* also remove the devices themselves from the lower/upper device
5264          * lists
5265          */
5266         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5267                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5268
5269         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5270                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5271
5272         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5273 }
5274 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5275
     /* Call netdev_adjacent_sysfs_add() for @dev and each of its direct upper
      * and lower neighbours, once in each direction (neighbour's list entry
      * for @dev, and dev's list entry for the neighbour).  Neighbours that
      * live in a different network namespace than @dev are skipped.
      */
5276 void netdev_adjacent_add_links(struct net_device *dev)
5277 {
5278         struct netdev_adjacent *iter;
5279
5280         struct net *net = dev_net(dev);
5281
             /* Upper neighbours: dev is in their lower list. */
5282         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5283                 if (!net_eq(net,dev_net(iter->dev)))
5284                         continue;
5285                 netdev_adjacent_sysfs_add(iter->dev, dev,
5286                                           &iter->dev->adj_list.lower);
5287                 netdev_adjacent_sysfs_add(dev, iter->dev,
5288                                           &dev->adj_list.upper);
5289         }
5290
             /* Lower neighbours: dev is in their upper list. */
5291         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5292                 if (!net_eq(net,dev_net(iter->dev)))
5293                         continue;
5294                 netdev_adjacent_sysfs_add(iter->dev, dev,
5295                                           &iter->dev->adj_list.upper);
5296                 netdev_adjacent_sysfs_add(dev, iter->dev,
5297                                           &dev->adj_list.lower);
5298         }
5299 }
5300
     /* Counterpart of netdev_adjacent_add_links(): call
      * netdev_adjacent_sysfs_del() for @dev and each of its direct upper and
      * lower neighbours, once in each direction.  Entries are identified by
      * device name.  Neighbours that live in a different network namespace
      * than @dev are skipped.
      */
5301 void netdev_adjacent_del_links(struct net_device *dev)
5302 {
5303         struct netdev_adjacent *iter;
5304
5305         struct net *net = dev_net(dev);
5306
             /* Upper neighbours: dev is in their lower list. */
5307         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5308                 if (!net_eq(net,dev_net(iter->dev)))
5309                         continue;
5310                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5311                                           &iter->dev->adj_list.lower);
5312                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5313                                           &dev->adj_list.upper);
5314         }
5315
             /* Lower neighbours: dev is in their upper list. */
5316         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5317                 if (!net_eq(net,dev_net(iter->dev)))
5318                         continue;
5319                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5320                                           &iter->dev->adj_list.upper);
5321                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5322                                           &dev->adj_list.lower);
5323         }
5324 }
5325
     /* Refresh the neighbours' adjacency entries for @dev after a rename.
      *
      * For every direct upper and lower neighbour in the same network
      * namespace, the entry registered under @oldname is deleted from the
      * neighbour's list and added again for @dev (presumably so that it is
      * re-created under dev's current name - confirm against
      * netdev_adjacent_sysfs_add()).  Only the neighbours' side is touched;
      * dev's own lists are not passed to the sysfs helpers here.
      */
5326 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5327 {
5328         struct netdev_adjacent *iter;
5329
5330         struct net *net = dev_net(dev);
5331
5332         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5333                 if (!net_eq(net,dev_net(iter->dev)))
5334                         continue;
5335                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5336                                           &iter->dev->adj_list.lower);
5337                 netdev_adjacent_sysfs_add(iter->dev, dev,
5338                                           &iter->dev->adj_list.lower);
5339         }
5340
5341         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5342                 if (!net_eq(net,dev_net(iter->dev)))
5343                         continue;
5344                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5345                                           &iter->dev->adj_list.upper);
5346                 netdev_adjacent_sysfs_add(iter->dev, dev,
5347                                           &iter->dev->adj_list.upper);
5348         }
5349 }
5350
     /**
      * netdev_lower_dev_get_private - get the private pointer of a lower link
      * @dev: upper device
      * @lower_dev: lower device to look up; may be NULL
      *
      * Looks @lower_dev up in dev's adj_list.lower and returns the ->private
      * pointer of the matching adjacency entry.  Returns NULL if @lower_dev
      * is NULL or is not found in that list.
      */
5351 void *netdev_lower_dev_get_private(struct net_device *dev,
5352                                    struct net_device *lower_dev)
5353 {
5354         struct netdev_adjacent *lower;
5355
5356         if (!lower_dev)
5357                 return NULL;
5358         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5359         if (!lower)
5360                 return NULL;
5361
5362         return lower->private;
5363 }
5364 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5365
5366
     /**
      * dev_get_nest_level - compute how deeply devices of one kind are stacked
      * @dev: device to start from
      * @type_check: predicate telling whether a device is of the kind counted
      *
      * Recurses through the lower devices of @dev, takes the largest value
      * found among them and adds one if @type_check accepts @dev itself.
      * A device without lower devices therefore yields 0 if it matches and
      * -1 if it does not.  The caller must hold the RTNL lock.
      */
5367 int dev_get_nest_level(struct net_device *dev,
5368                        bool (*type_check)(struct net_device *dev))
5369 {
5370         struct net_device *lower = NULL;
5371         struct list_head *iter;
             /* -1 so that a single matching device ends up at level 0. */
5372         int max_nest = -1;
5373         int nest;
5374
5375         ASSERT_RTNL();
5376
5377         netdev_for_each_lower_dev(dev, lower, iter) {
5378                 nest = dev_get_nest_level(lower, type_check);
5379                 if (max_nest < nest)
5380                         max_nest = nest;
5381         }
5382
5383         if (type_check(dev))
5384                 max_nest++;
5385
5386         return max_nest;
5387 }
5388 EXPORT_SYMBOL(dev_get_nest_level);
5389
     /* Pass @flags to the driver's ndo_change_rx_flags callback, if the
      * driver provides one; otherwise do nothing.
      */
5390 static void dev_change_rx_flags(struct net_device *dev, int flags)
5391 {
5392         const struct net_device_ops *ops = dev->netdev_ops;
5393
5394         if (ops->ndo_change_rx_flags)
5395                 ops->ndo_change_rx_flags(dev, flags);
5396 }
5397
     /* Adjust the promiscuity counter of @dev by @inc and keep IFF_PROMISC
      * in dev->flags in sync with it.
      *
      * @notify: when true, __dev_notify_flags() is called with IFF_PROMISC
      *          as the changes mask, whether or not the flag flipped.
      *
      * Returns 0 on success, or -EOVERFLOW when a non-negative @inc leaves
      * the counter at zero; in that case @inc is subtracted again.  The
      * caller must hold the RTNL lock.
      */
5398 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5399 {
5400         unsigned int old_flags = dev->flags;
5401         kuid_t uid;
5402         kgid_t gid;
5403
5404         ASSERT_RTNL();
5405
             /* Set the flag unconditionally; it is cleared again below when a
              * decrement brings the counter down to zero.
              */
5406         dev->flags |= IFF_PROMISC;
5407         dev->promiscuity += inc;
5408         if (dev->promiscuity == 0) {
5409                 /*
5410                  * Avoid overflow.
5411                  * If inc causes overflow, untouch promisc and return error.
5412                  */
5413                 if (inc < 0)
5414                         dev->flags &= ~IFF_PROMISC;
5415                 else {
                             /* NOTE(review): IFF_PROMISC set above is not
                              * cleared on this path; presumably it was already
                              * set because the counter was non-zero - confirm
                              * for inc == 0 with a zero counter.
                              */
5416                         dev->promiscuity -= inc;
5417                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5418                                 dev->name);
5419                         return -EOVERFLOW;
5420                 }
5421         }
             /* The flag really flipped: log it, audit it and tell the driver. */
5422         if (dev->flags != old_flags) {
5423                 pr_info("device %s %s promiscuous mode\n",
5424                         dev->name,
5425                         dev->flags & IFF_PROMISC ? "entered" : "left");
5426                 if (audit_enabled) {
5427                         current_uid_gid(&uid, &gid);
5428                         audit_log(current->audit_context, GFP_ATOMIC,
5429                                 AUDIT_ANOM_PROMISCUOUS,
5430                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5431                                 dev->name, (dev->flags & IFF_PROMISC),
5432                                 (old_flags & IFF_PROMISC),
5433                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5434                                 from_kuid(&init_user_ns, uid),
5435                                 from_kgid(&init_user_ns, gid),
5436                                 audit_get_sessionid(current));
5437                 }
5438
5439                 dev_change_rx_flags(dev, IFF_PROMISC);
5440         }
5441         if (notify)
5442                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5443         return 0;
5444 }
5445
5446 /**
5447  *      dev_set_promiscuity     - update promiscuity count on a device
5448  *      @dev: device
5449  *      @inc: modifier
5450  *
5451  *      Add or remove promiscuity from a device. While the count in the device
5452  *      remains above zero the interface remains promiscuous. Once it hits zero
5453  *      the device reverts back to normal filtering operation. A negative inc
5454  *      value is used to drop promiscuity on the device.
5455  *      Return 0 if successful or a negative errno code on error.
5456  */
5457 int dev_set_promiscuity(struct net_device *dev, int inc)
5458 {
5459         unsigned int old_flags = dev->flags;
5460         int err;
5461
5462         err = __dev_set_promiscuity(dev, inc, true);
5463         if (err < 0)
5464                 return err;
             /* Reprogram the rx filter only when dev->flags actually changed. */
5465         if (dev->flags != old_flags)
5466                 dev_set_rx_mode(dev);
5467         return err;
5468 }
5469 EXPORT_SYMBOL(dev_set_promiscuity);
5470
     /* Adjust the allmulti counter of @dev by @inc and keep IFF_ALLMULTI in
      * dev->flags in sync with it.
      *
      * When the flag flips, the driver is told through dev_change_rx_flags()
      * and the rx mode is reprogrammed.  @notify additionally requests a
      * __dev_notify_flags() call in that case, with the gflags bits that
      * differ from their value on entry as the changes mask.
      *
      * Returns 0 on success, or -EOVERFLOW when a non-negative @inc leaves
      * the counter at zero; in that case @inc is subtracted again.  The
      * caller must hold the RTNL lock.
      */
5471 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5472 {
5473         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5474
5475         ASSERT_RTNL();
5476
             /* Set the flag unconditionally; it is cleared again below when a
              * decrement brings the counter down to zero.
              */
5477         dev->flags |= IFF_ALLMULTI;
5478         dev->allmulti += inc;
5479         if (dev->allmulti == 0) {
5480                 /*
5481                  * Avoid overflow.
5482                  * If inc causes overflow, untouch allmulti and return error.
5483                  */
5484                 if (inc < 0)
5485                         dev->flags &= ~IFF_ALLMULTI;
5486                 else {
5487                         dev->allmulti -= inc;
5488                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5489                                 dev->name);
5490                         return -EOVERFLOW;
5491                 }
5492         }
5493         if (dev->flags ^ old_flags) {
5494                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5495                 dev_set_rx_mode(dev);
5496                 if (notify)
5497                         __dev_notify_flags(dev, old_flags,
5498                                            dev->gflags ^ old_gflags);
5499         }
5500         return 0;
5501 }
5502
5503 /**
5504  *      dev_set_allmulti        - update allmulti count on a device
5505  *      @dev: device
5506  *      @inc: modifier
5507  *
5508  *      Add or remove reception of all multicast frames to a device. While the
5509  *      count in the device remains above zero the interface keeps receiving
5510  *      all multicast frames. Once it hits zero the device reverts back to normal
5511  *      filtering operation. A negative @inc value is used to drop the counter
5512  *      when releasing a resource needing all multicasts.
5513  *      Return 0 if successful or a negative errno code on error.
5514  */
5515
5516 int dev_set_allmulti(struct net_device *dev, int inc)
5517 {
5518         return __dev_set_allmulti(dev, inc, true);
5519 }
5520 EXPORT_SYMBOL(dev_set_allmulti);
5521
5522 /*
5523  *      Upload unicast and multicast address lists to device and
5524  *      configure RX filtering. When the device doesn't support unicast
5525  *      filtering it is put in promiscuous mode while unicast addresses
5526  *      are present.
      *
      *      Does nothing unless the device is up (IFF_UP) and present.
      *      dev_set_rx_mode() is the wrapper that takes the address lock
      *      around this function.
5527  */
5528 void __dev_set_rx_mode(struct net_device *dev)
5529 {
5530         const struct net_device_ops *ops = dev->netdev_ops;
5531
5532         /* dev_open will call this function so the list will stay sane. */
5533         if (!(dev->flags&IFF_UP))
5534                 return;
5535
5536         if (!netif_device_present(dev))
5537                 return;
5538
5539         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5540                 /* Unicast addresses changes may only happen under the rtnl,
5541                  * therefore calling __dev_set_promiscuity here is safe.
5542                  */
                     /* dev->uc_promisc remembers whether this function holds a
                      * promiscuity reference on behalf of the unicast list.
                      *
                      * NOTE(review): the return value of
                      * __dev_set_promiscuity() is ignored, so uc_promisc is
                      * flipped even if the counter update failed with
                      * -EOVERFLOW - confirm this is intended.
                      */
5543                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5544                         __dev_set_promiscuity(dev, 1, false);
5545                         dev->uc_promisc = true;
5546                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5547                         __dev_set_promiscuity(dev, -1, false);
5548                         dev->uc_promisc = false;
5549                 }
5550         }
5551
5552         if (ops->ndo_set_rx_mode)
5553                 ops->ndo_set_rx_mode(dev);
5554 }
5555
     /* Locked wrapper: run __dev_set_rx_mode() between netif_addr_lock_bh()
      * and netif_addr_unlock_bh() on @dev.
      */
5556 void dev_set_rx_mode(struct net_device *dev)
5557 {
5558         netif_addr_lock_bh(dev);
5559         __dev_set_rx_mode(dev);
5560         netif_addr_unlock_bh(dev);
5561 }
5562
5563 /**
5564  *      dev_get_flags - get flags reported to userspace
5565  *      @dev: device
5566  *
5567  *      Get the combination of flag bits exported through APIs to userspace.
      *
      *      IFF_PROMISC and IFF_ALLMULTI are taken from dev->gflags instead of
      *      dev->flags; IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are derived
      *      from the current device state and are only set while the device
      *      is running.
5568  */
5569 unsigned int dev_get_flags(const struct net_device *dev)
5570 {
5571         unsigned int flags;
5572
             /* Start from dev->flags minus the bits that are recomputed below,
              * then merge in the gflags view of PROMISC/ALLMULTI.
              */
5573         flags = (dev->flags & ~(IFF_PROMISC |
5574                                 IFF_ALLMULTI |
5575                                 IFF_RUNNING |
5576                                 IFF_LOWER_UP |
5577                                 IFF_DORMANT)) |
5578                 (dev->gflags & (IFF_PROMISC |
5579                                 IFF_ALLMULTI));
5580
5581         if (netif_running(dev)) {
5582                 if (netif_oper_up(dev))
5583                         flags |= IFF_RUNNING;
5584                 if (netif_carrier_ok(dev))
5585                         flags |= IFF_LOWER_UP;
5586                 if (netif_dormant(dev))
5587                         flags |= IFF_DORMANT;
5588         }
5589
5590         return flags;
5591 }
5592 EXPORT_SYMBOL(dev_get_flags);
5593
     /* Apply the userspace-format @flags to @dev.
      *
      * The promiscuity and allmulti helpers are called with notify == false;
      * dev_change_flags() issues __dev_notify_flags() after this function
      * returns.
      *
      * Returns the result of __dev_open()/__dev_close() when IFF_UP is
      * toggled, otherwise 0.  Failures of the promiscuity and allmulti
      * updates are not reflected in the return value.  The caller must hold
      * the RTNL lock.
      */
5594 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5595 {
5596         unsigned int old_flags = dev->flags;
5597         int ret;
5598
5599         ASSERT_RTNL();
5600
5601         /*
5602          *      Set the flags on our device.
5603          */
5604
             /* The first mask lists the bits copied from @flags; the second
              * lists the bits preserved from the current dev->flags.
              */
5605         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5606                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5607                                IFF_AUTOMEDIA)) |
5608                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5609                                     IFF_ALLMULTI));
5610
5611         /*
5612          *      Load in the correct multicast list now the flags have changed.
5613          */
5614
5615         if ((old_flags ^ flags) & IFF_MULTICAST)
5616                 dev_change_rx_flags(dev, IFF_MULTICAST);
5617
5618         dev_set_rx_mode(dev);
5619
5620         /*
5621          *      Have we downed the interface. We handle IFF_UP ourselves
5622          *      according to user attempts to set it, rather than blindly
5623          *      setting it.
5624          */
5625
5626         ret = 0;
             /* Close the device if it was up, open it otherwise. */
5627         if ((old_flags ^ flags) & IFF_UP)
5628                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5629
             /* gflags holds the user-requested PROMISC/ALLMULTI state; only act
              * when the request differs from it.
              */
5630         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5631                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
                     /* Shadows the outer old_flags: a snapshot of dev->flags
                      * taken just before the promiscuity update.
                      */
5632                 unsigned int old_flags = dev->flags;
5633
5634                 dev->gflags ^= IFF_PROMISC;
5635
5636                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5637                         if (dev->flags != old_flags)
5638                                 dev_set_rx_mode(dev);
5639         }
5640
5641         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5642            is important. Some (broken) drivers set IFF_PROMISC, when
5643            IFF_ALLMULTI is requested not asking us and not reporting.
5644          */
5645         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5646                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5647
5648                 dev->gflags ^= IFF_ALLMULTI;
5649                 __dev_set_allmulti(dev, inc, false);
5650         }
5651
5652         return ret;
5653 }
5654
     /* Send the notifications that follow a change of dev->flags.
      *
      * @old_flags: value of dev->flags before the change
      * @gchanges:  mask passed to rtmsg_ifinfo(); no RTM_NEWLINK message is
      *             sent when it is 0
      *
      * NETDEV_UP or NETDEV_DOWN is raised when IFF_UP flipped.  NETDEV_CHANGE
      * is raised when the device is up and a flag outside IFF_UP,
      * IFF_PROMISC, IFF_ALLMULTI and IFF_VOLATILE changed.
      */
5655 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5656                         unsigned int gchanges)
5657 {
5658         unsigned int changes = dev->flags ^ old_flags;
5659
5660         if (gchanges)
5661                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5662
5663         if (changes & IFF_UP) {
5664                 if (dev->flags & IFF_UP)
5665                         call_netdevice_notifiers(NETDEV_UP, dev);
5666                 else
5667                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5668         }
5669
5670         if (dev->flags & IFF_UP &&
5671             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5672                 struct netdev_notifier_change_info change_info;
5673
                     /* Only flags_changed is filled in here; the embedded info
                      * member is presumably initialised by
                      * call_netdevice_notifiers_info() - TODO confirm.
                      */
5674                 change_info.flags_changed = changes;
5675                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5676                                               &change_info.info);
5677         }
5678 }
5679
5680 /**
5681  *      dev_change_flags - change device settings
5682  *      @dev: device
5683  *      @flags: device state flags
5684  *
5685  *      Change settings on device based state flags. The flags are
5686  *      in the userspace exported format.
      *
      *      Returns the value of __dev_change_flags(). A negative value is
      *      returned immediately, without sending any notification.
5687  */
5688 int dev_change_flags(struct net_device *dev, unsigned int flags)
5689 {
5690         int ret;
5691         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5692
5693         ret = __dev_change_flags(dev, flags);
5694         if (ret < 0)
5695                 return ret;
5696
             /* Report every bit that differs in either flags or gflags. */
5697         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5698         __dev_notify_flags(dev, old_flags, changes);
5699         return ret;
5700 }
5701 EXPORT_SYMBOL(dev_change_flags);
5702
     /* Set the MTU of @dev without any checks or notifications.
      *
      * If the driver provides ndo_change_mtu it is called and its result is
      * returned; otherwise dev->mtu is assigned directly and 0 is returned.
      */
5703 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5704 {
5705         const struct net_device_ops *ops = dev->netdev_ops;
5706
5707         if (ops->ndo_change_mtu)
5708                 return ops->ndo_change_mtu(dev, new_mtu);
5709
5710         dev->mtu = new_mtu;
5711         return 0;
5712 }
5713
5714 /**
5715  *      dev_set_mtu - Change maximum transfer unit
5716  *      @dev: device
5717  *      @new_mtu: new transfer unit
5718  *
5719  *      Change the maximum transfer size of the network device.
      *
      *      Returns 0 on success (also when @new_mtu equals the current MTU),
      *      -EINVAL for a negative @new_mtu, -ENODEV if the device is not
      *      present, or the error reported by a NETDEV_PRECHANGEMTU or
      *      NETDEV_CHANGEMTU notifier or by the MTU change itself. If a
      *      NETDEV_CHANGEMTU notifier fails, the previous MTU is restored.
5720  */
5721 int dev_set_mtu(struct net_device *dev, int new_mtu)
5722 {
5723         int err, orig_mtu;
5724
5725         if (new_mtu == dev->mtu)
5726                 return 0;
5727
5728         /*      MTU must not be negative (zero passes this check).    */
5729         if (new_mtu < 0)
5730                 return -EINVAL;
5731
5732         if (!netif_device_present(dev))
5733                 return -ENODEV;
5734
             /* Give notifier subscribers a chance to veto the change. */
5735         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5736         err = notifier_to_errno(err);
5737         if (err)
5738                 return err;
5739
5740         orig_mtu = dev->mtu;
5741         err = __dev_set_mtu(dev, new_mtu);
5742
5743         if (!err) {
5744                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5745                 err = notifier_to_errno(err);
5746                 if (err) {
5747                         /* setting mtu back and notifying everyone again,
5748                          * so that they have a chance to revert changes.
5749                          */
5750                         __dev_set_mtu(dev, orig_mtu);
5751                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5752                 }
5753         }
5754         return err;
5755 }
5756 EXPORT_SYMBOL(dev_set_mtu);
5757
5758 /**
5759  *      dev_set_group - Change group this device belongs to
5760  *      @dev: device
5761  *      @new_group: group this device should belong to
      *
      *      Plain assignment of dev->group; this function itself performs no
      *      validation and sends no notification.
5762  */
5763 void dev_set_group(struct net_device *dev, int new_group)
5764 {
5765         dev->group = new_group;
5766 }
5767 EXPORT_SYMBOL(dev_set_group);
5768
5769 /**
5770  *      dev_set_mac_address - Change Media Access Control Address
5771  *      @dev: device
5772  *      @sa: new address
5773  *
5774  *      Change the hardware (MAC) address of the device
      *
      *      Returns 0 on success, -EOPNOTSUPP if the driver has no
      *      ndo_set_mac_address, -EINVAL if @sa->sa_family differs from
      *      dev->type, -ENODEV if the device is not present, or the error
      *      returned by the driver.
5775  */
5776 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5777 {
5778         const struct net_device_ops *ops = dev->netdev_ops;
5779         int err;
5780
5781         if (!ops->ndo_set_mac_address)
5782                 return -EOPNOTSUPP;
5783         if (sa->sa_family != dev->type)
5784                 return -EINVAL;
5785         if (!netif_device_present(dev))
5786                 return -ENODEV;
5787         err = ops->ndo_set_mac_address(dev, sa);
5788         if (err)
5789                 return err;
             /* Record that the address was explicitly set, announce the
              * change, and feed the new address to add_device_randomness().
              */
5790         dev->addr_assign_type = NET_ADDR_SET;
5791         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5792         add_device_randomness(dev->dev_addr, dev->addr_len);
5793         return 0;
5794 }
5795 EXPORT_SYMBOL(dev_set_mac_address);
5796
5797 /**
5798  *      dev_change_carrier - Change device carrier
5799  *      @dev: device
5800  *      @new_carrier: new value
5801  *
5802  *      Change device carrier
      *
      *      Returns -EOPNOTSUPP if the driver has no ndo_change_carrier,
      *      -ENODEV if the device is not present, otherwise the value
      *      returned by the driver's ndo_change_carrier.
5803  */
5804 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5805 {
5806         const struct net_device_ops *ops = dev->netdev_ops;
5807
5808         if (!ops->ndo_change_carrier)
5809                 return -EOPNOTSUPP;
5810         if (!netif_device_present(dev))
5811                 return -ENODEV;
5812         return ops->ndo_change_carrier(dev, new_carrier);
5813 }
5814 EXPORT_SYMBOL(dev_change_carrier);
5815
5816 /**
5817  *      dev_get_phys_port_id - Get device physical port ID
5818  *      @dev: device
5819  *      @ppid: port ID
5820  *
5821  *      Get device physical port ID
      *
      *      Returns -EOPNOTSUPP if the driver has no ndo_get_phys_port_id,
      *      otherwise the value returned by that callback, which is handed
      *      @ppid.
5822  */
5823 int dev_get_phys_port_id(struct net_device *dev,
5824                          struct netdev_phys_port_id *ppid)
5825 {
5826         const struct net_device_ops *ops = dev->netdev_ops;
5827
5828         if (!ops->ndo_get_phys_port_id)
5829                 return -EOPNOTSUPP;
5830         return ops->ndo_get_phys_port_id(dev, ppid);
5831 }
5832 EXPORT_SYMBOL(dev_get_phys_port_id);
5833
5834 /**
5835  *      dev_new_index   -       allocate an ifindex
5836  *      @net: the applicable net namespace
5837  *
5838  *      Returns a suitable unique value for a new device interface
5839  *      number.  The caller must hold the rtnl semaphore or the
5840  *      dev_base_lock to be sure it remains unique.
      *
      *      The search starts just after net->ifindex and the value found is
      *      stored back into net->ifindex before it is returned.
5841  */
5842 static int dev_new_index(struct net *net)
5843 {
5844         int ifindex = net->ifindex;
5845         for (;;) {
                     /* Restart at 1 when the counter is no longer positive.
                      * NOTE(review): reaching this from INT_MAX relies on
                      * signed wraparound - presumably guaranteed by the
                      * kernel's build flags; confirm.
                      */
5846                 if (++ifindex <= 0)
5847                         ifindex = 1;
                     /* Take the first index that no device in @net uses. */
5848                 if (!__dev_get_by_index(net, ifindex))
5849                         return net->ifindex = ifindex;
5850         }
5851 }
5852
5853 /* Delayed registration/unregistration */
5854 static LIST_HEAD(net_todo_list);
5855 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5856
     /* Append @dev to net_todo_list (through dev->todo_list) and increment
      * the dev_unreg_count of the network namespace it belongs to.
      */
5857 static void net_set_todo(struct net_device *dev)
5858 {
5859         list_add_tail(&dev->todo_list, &net_todo_list);
5860         dev_net(dev)->dev_unreg_count++;
5861 }
5862
     /* Tear down every device queued on @head (linked through unreg_list).
      *
      * Devices that were never registered are warned about and dropped from
      * the list; every other device must be in NETREG_REGISTERED state.
      * The remaining devices are closed, unlisted and marked
      * NETREG_UNREGISTERING, then shut down, announced through
      * NETDEV_UNREGISTER, flushed, uninitialised and removed from the kobject
      * tree.  Finally dev_put() is called once on each device.
      *
      * The caller must hold the RTNL lock; calling this during
      * dev_boot_phase is a bug.
      */
5863 static void rollback_registered_many(struct list_head *head)
5864 {
5865         struct net_device *dev, *tmp;
5866         LIST_HEAD(close_head);
5867
5868         BUG_ON(dev_boot_phase);
5869         ASSERT_RTNL();
5870
             /* The _safe iterator is needed because entries may be deleted. */
5871         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5872                 /* Some devices call without registering
5873                  * for initialization unwind. Remove those
5874                  * devices and proceed with the remaining.
5875                  */
5876                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5877                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5878                                  dev->name, dev);
5879
5880                         WARN_ON(1);
5881                         list_del(&dev->unreg_list);
5882                         continue;
5883                 }
5884                 dev->dismantle = true;
5885                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5886         }
5887
5888         /* If device is running, close it first. */
5889         list_for_each_entry(dev, head, unreg_list)
5890                 list_add_tail(&dev->close_list, &close_head);
5891         dev_close_many(&close_head);
5892
5893         list_for_each_entry(dev, head, unreg_list) {
5894                 /* And unlink it from device chain. */
5895                 unlist_netdevice(dev);
5896
5897                 dev->reg_state = NETREG_UNREGISTERING;
5898                 on_each_cpu(flush_backlog, dev, 1);
5899         }
5900
5901         synchronize_net();
5902
5903         list_for_each_entry(dev, head, unreg_list) {
5904                 /* Shutdown queueing discipline. */
5905                 dev_shutdown(dev);
5906
5907
5908                 /* Notify protocols, that we are about to destroy
5909                    this device. They should clean all the things.
5910                 */
5911                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5912
5913                 /*
5914                  *      Flush the unicast and multicast chains
5915                  */
5916                 dev_uc_flush(dev);
5917                 dev_mc_flush(dev);
5918
5919                 if (dev->netdev_ops->ndo_uninit)
5920                         dev->netdev_ops->ndo_uninit(dev);
5921
                     /* RTM_DELLINK is sent for devices without rtnl_link_ops,
                      * or whose rtnl link state is RTNL_LINK_INITIALIZED.
                      */
5922                 if (!dev->rtnl_link_ops ||
5923                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5924                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5925
5926                 /* Notifier chain MUST detach us all upper devices. */
5927                 WARN_ON(netdev_has_any_upper_dev(dev));
5928
5929                 /* Remove entries from kobject tree */
5930                 netdev_unregister_kobject(dev);
5931 #ifdef CONFIG_XPS
5932                 /* Remove XPS queueing entries */
5933                 netif_reset_xps_queues_gt(dev, 0);
5934 #endif
5935         }
5936
5937         synchronize_net();
5938
5939         list_for_each_entry(dev, head, unreg_list)
5940                 dev_put(dev);
5941 }
5942
     /* Single-device convenience wrapper: put @dev on a temporary on-stack
      * list and hand that list to rollback_registered_many().
      */
5943 static void rollback_registered(struct net_device *dev)
5944 {
5945         LIST_HEAD(single);
5946
5947         list_add(&dev->unreg_list, &single);
5948         rollback_registered_many(&single);
             /* Detach the on-stack list head before it goes out of scope. */
5949         list_del(&single);
5950 }
5951
5952 static netdev_features_t netdev_fix_features(struct net_device *dev,
5953         netdev_features_t features)
5954 {
5955         /* Fix illegal checksum combinations */
5956         if ((features & NETIF_F_HW_CSUM) &&
5957             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5958                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5959                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5960         }
5961
5962         /* TSO requires that SG is present as well. */
5963         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5964                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5965                 features &= ~NETIF_F_ALL_TSO;
5966         }
5967
5968         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5969                                         !(features & NETIF_F_IP_CSUM)) {
5970                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5971                 features &= ~NETIF_F_TSO;
5972                 features &= ~NETIF_F_TSO_ECN;
5973         }
5974
5975         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5976                                          !(features & NETIF_F_IPV6_CSUM)) {
5977                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5978                 features &= ~NETIF_F_TSO6;
5979         }
5980
5981         /* TSO ECN requires that TSO is present as well. */
5982         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5983                 features &= ~NETIF_F_TSO_ECN;
5984
5985         /* Software GSO depends on SG. */
5986         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5987                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5988                 features &= ~NETIF_F_GSO;
5989         }
5990
5991         /* UFO needs SG and checksumming */
5992         if (features & NETIF_F_UFO) {
5993                 /* maybe split UFO into V4 and V6? */
5994                 if (!((features & NETIF_F_GEN_CSUM) ||
5995                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5996                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5997                         netdev_dbg(dev,
5998                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5999                         features &= ~NETIF_F_UFO;
6000                 }
6001
6002                 if (!(features & NETIF_F_SG)) {
6003                         netdev_dbg(dev,
6004                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6005                         features &= ~NETIF_F_UFO;
6006                 }
6007         }
6008
6009 #ifdef CONFIG_NET_RX_BUSY_POLL
6010         if (dev->netdev_ops->ndo_busy_poll)
6011                 features |= NETIF_F_BUSY_POLL;
6012         else
6013 #endif
6014                 features &= ~NETIF_F_BUSY_POLL;
6015
6016         return features;
6017 }
6018
6019 int __netdev_update_features(struct net_device *dev)
6020 {
6021         netdev_features_t features;
6022         int err = 0;
6023
6024         ASSERT_RTNL();
6025
6026         features = netdev_get_wanted_features(dev);
6027
6028         if (dev->netdev_ops->ndo_fix_features)
6029                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6030
6031         /* driver might be less strict about feature dependencies */
6032         features = netdev_fix_features(dev, features);
6033
6034         if (dev->features == features)
6035                 return 0;
6036
6037         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6038                 &dev->features, &features);
6039
6040         if (dev->netdev_ops->ndo_set_features)
6041                 err = dev->netdev_ops->ndo_set_features(dev, features);
6042
6043         if (unlikely(err < 0)) {
6044                 netdev_err(dev,
6045                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6046                         err, &features, &dev->features);
6047                 return -1;
6048         }
6049
6050         if (!err)
6051                 dev->features = features;
6052
6053         return 1;
6054 }
6055
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate the dev->features set and, unless the recalculation
 *	reports "no change", send a features-change notification. Meant to
 *	be called after driver or hardware dependent conditions that
 *	influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	/* 0 means nothing changed; any other result triggers the notification */
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6070
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 *
 *	The result of the recalculation is ignored; the notification is
 *	sent in every case. __netdev_update_features() asserts RTNL, so
 *	the caller must hold it.
 */
void netdev_change_features(struct net_device *dev)
{
	/* return value unused: the notification below is unconditional */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6087
6088 /**
6089  *      netif_stacked_transfer_operstate -      transfer operstate
6090  *      @rootdev: the root or lower level device to transfer state from
6091  *      @dev: the device to transfer operstate to
6092  *
6093  *      Transfer operational state from root to device. This is normally
6094  *      called when a stacking relationship exists between the root
6095  *      device and the device(a leaf device).
6096  */
6097 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6098                                         struct net_device *dev)
6099 {
6100         if (rootdev->operstate == IF_OPER_DORMANT)
6101                 netif_dormant_on(dev);
6102         else
6103                 netif_dormant_off(dev);
6104
6105         if (netif_carrier_ok(rootdev)) {
6106                 if (!netif_carrier_ok(dev))
6107                         netif_carrier_on(dev);
6108         } else {
6109                 if (netif_carrier_ok(dev))
6110                         netif_carrier_off(dev);
6111         }
6112 }
6113 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6114
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed RX queue array for @dev (dev->num_rx_queues entries),
 * store it in dev->_rx and point every entry back at @dev.
 *
 * Returns 0 on success or -ENOMEM. A queue count of zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int q;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = 0; q < nr_queues; q++)
		queues[q].dev = dev;

	return 0;
}
#endif
6134
/*
 * Initialise a single struct netdev_queue that belongs to @dev.
 *
 * The (dev, queue, void *) signature is the callback form passed to
 * netdev_for_each_tx_queue(); the third argument is not used here.
 */
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	/* the lockdep class for the xmit lock is chosen from dev->type */
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	/* presumably -1 means "lock currently not owned" - TODO confirm */
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	/* back-pointer from the queue to its device */
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}
6148
/*
 * Release the TX queue array of @dev.
 *
 * kvfree() is used because netif_alloc_netdev_queues() obtains the array
 * from kzalloc() and falls back to vzalloc() when that fails.
 */
static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}
6153
6154 static int netif_alloc_netdev_queues(struct net_device *dev)
6155 {
6156         unsigned int count = dev->num_tx_queues;
6157         struct netdev_queue *tx;
6158         size_t sz = count * sizeof(*tx);
6159
6160         if (count < 1 || count > 0xffff)
6161                 return -EINVAL;
6162
6163         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6164         if (!tx) {
6165                 tx = vzalloc(sz);
6166                 if (!tx)
6167                         return -ENOMEM;
6168         }
6169         dev->_tx = tx;
6170
6171         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6172         spin_lock_init(&dev->tx_global_lock);
6173
6174         return 0;
6175 }
6176
6177 /**
6178  *      register_netdevice      - register a network device
6179  *      @dev: device to register
6180  *
6181  *      Take a completed network device structure and add it to the kernel
6182  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6183  *      chain. 0 is returned on success. A negative errno code is returned
6184  *      on a failure to set up the device, or if the name is a duplicate.
6185  *
6186  *      Callers must hold the rtnl semaphore. You may want
6187  *      register_netdev() instead of this.
6188  *
6189  *      BUGS:
6190  *      The locking appears insufficient to guarantee two parallel registers
6191  *      will not get the same name.
6192  */
6193
6194 int register_netdevice(struct net_device *dev)
6195 {
6196         int ret;
6197         struct net *net = dev_net(dev);
6198
6199         BUG_ON(dev_boot_phase);
6200         ASSERT_RTNL();
6201
6202         might_sleep();
6203
6204         /* When net_device's are persistent, this will be fatal. */
6205         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6206         BUG_ON(!net);
6207
6208         spin_lock_init(&dev->addr_list_lock);
6209         netdev_set_addr_lockdep_class(dev);
6210
6211         dev->iflink = -1;
6212
6213         ret = dev_get_valid_name(net, dev, dev->name);
6214         if (ret < 0)
6215                 goto out;
6216
6217         /* Init, if this function is available */
6218         if (dev->netdev_ops->ndo_init) {
6219                 ret = dev->netdev_ops->ndo_init(dev);
6220                 if (ret) {
6221                         if (ret > 0)
6222                                 ret = -EIO;
6223                         goto out;
6224                 }
6225         }
6226
6227         if (((dev->hw_features | dev->features) &
6228              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6229             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6230              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6231                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6232                 ret = -EINVAL;
6233                 goto err_uninit;
6234         }
6235
6236         ret = -EBUSY;
6237         if (!dev->ifindex)
6238                 dev->ifindex = dev_new_index(net);
6239         else if (__dev_get_by_index(net, dev->ifindex))
6240                 goto err_uninit;
6241
6242         if (dev->iflink == -1)
6243                 dev->iflink = dev->ifindex;
6244
6245         /* Transfer changeable features to wanted_features and enable
6246          * software offloads (GSO and GRO).
6247          */
6248         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6249         dev->features |= NETIF_F_SOFT_FEATURES;
6250         dev->wanted_features = dev->features & dev->hw_features;
6251
6252         if (!(dev->flags & IFF_LOOPBACK)) {
6253                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6254         }
6255
6256         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6257          */
6258         dev->vlan_features |= NETIF_F_HIGHDMA;
6259
6260         /* Make NETIF_F_SG inheritable to tunnel devices.
6261          */
6262         dev->hw_enc_features |= NETIF_F_SG;
6263
6264         /* Make NETIF_F_SG inheritable to MPLS.
6265          */
6266         dev->mpls_features |= NETIF_F_SG;
6267
6268         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6269         ret = notifier_to_errno(ret);
6270         if (ret)
6271                 goto err_uninit;
6272
6273         ret = netdev_register_kobject(dev);
6274         if (ret)
6275                 goto err_uninit;
6276         dev->reg_state = NETREG_REGISTERED;
6277
6278         __netdev_update_features(dev);
6279
6280         /*
6281          *      Default initial state at registry is that the
6282          *      device is present.
6283          */
6284
6285         set_bit(__LINK_STATE_PRESENT, &dev->state);
6286
6287         linkwatch_init_dev(dev);
6288
6289         dev_init_scheduler(dev);
6290         dev_hold(dev);
6291         list_netdevice(dev);
6292         add_device_randomness(dev->dev_addr, dev->addr_len);
6293
6294         /* If the device has permanent device address, driver should
6295          * set dev_addr and also addr_assign_type should be set to
6296          * NET_ADDR_PERM (default value).
6297          */
6298         if (dev->addr_assign_type == NET_ADDR_PERM)
6299                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6300
6301         /* Notify protocols, that a new device appeared. */
6302         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6303         ret = notifier_to_errno(ret);
6304         if (ret) {
6305                 rollback_registered(dev);
6306                 dev->reg_state = NETREG_UNREGISTERED;
6307         }
6308         /*
6309          *      Prevent userspace races by waiting until the network
6310          *      device is fully setup before sending notifications.
6311          */
6312         if (!dev->rtnl_link_ops ||
6313             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6314                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6315
6316 out:
6317         return ret;
6318
6319 err_uninit:
6320         if (dev->netdev_ops->ndo_uninit)
6321                 dev->netdev_ops->ndo_uninit(dev);
6322         goto out;
6323 }
6324 EXPORT_SYMBOL(register_netdevice);
6325
6326 /**
6327  *      init_dummy_netdev       - init a dummy network device for NAPI
6328  *      @dev: device to init
6329  *
6330  *      This takes a network device structure and initialize the minimum
6331  *      amount of fields so it can be used to schedule NAPI polls without
6332  *      registering a full blown interface. This is to be used by drivers
6333  *      that need to tie several hardware interfaces to a single NAPI
6334  *      poll scheduler due to HW limitations.
6335  */
6336 int init_dummy_netdev(struct net_device *dev)
6337 {
6338         /* Clear everything. Note we don't initialize spinlocks
6339          * are they aren't supposed to be taken by any of the
6340          * NAPI code and this dummy netdev is supposed to be
6341          * only ever used for NAPI polls
6342          */
6343         memset(dev, 0, sizeof(struct net_device));
6344
6345         /* make sure we BUG if trying to hit standard
6346          * register/unregister code path
6347          */
6348         dev->reg_state = NETREG_DUMMY;
6349
6350         /* NAPI wants this */
6351         INIT_LIST_HEAD(&dev->napi_list);
6352
6353         /* a dummy interface is started by default */
6354         set_bit(__LINK_STATE_PRESENT, &dev->state);
6355         set_bit(__LINK_STATE_START, &dev->state);
6356
6357         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6358          * because users of this 'device' dont need to change
6359          * its refcount.
6360          */
6361
6362         return 0;
6363 }
6364 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6365
6366
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 *
 *	The return value is that of register_netdevice(). The caller must
 *	not already hold the rtnl semaphore, since it is acquired here.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	/* register_netdevice() asserts that RTNL is held */
	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
6390
6391 int netdev_refcnt_read(const struct net_device *dev)
6392 {
6393         int i, refcnt = 0;
6394
6395         for_each_possible_cpu(i)
6396                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6397         return refcnt;
6398 }
6399 EXPORT_SYMBOL(netdev_refcnt_read);
6400
/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The function polls the reference count every 250 ms. Once more than
 * a second has passed since the last broadcast it re-sends
 * NETDEV_UNREGISTER and NETDEV_UNREGISTER_FINAL under RTNL, and once
 * more than ten seconds have passed since the last warning it logs an
 * emergency message with the remaining count. It takes and releases
 * RTNL itself and sleeps.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	/* both timers start now */
	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		/* more than one second since the last (re)broadcast? */
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			/* RTNL is dropped around rcu_barrier(). __rtnl_unlock()
			 * is used rather than rtnl_unlock() - presumably to
			 * avoid re-entering the todo processing; TODO confirm.
			 */
			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		/* warn at most once every ten seconds while still waiting */
		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
6462
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * Entered with RTNL held; it is released right after the pending list
 * has been snapshotted and is only re-taken briefly per device.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* the notifier chain is called with RTNL held */
		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		/* Only devices in UNREGISTERING state are expected here;
		 * anything else is reported and left alone.
		 */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* may sleep until the last reference is dropped */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		/* optional per-device destructor hook */
		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
6540
6541 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6542  * fields in the same order, with only the type differing.
6543  */
6544 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6545                              const struct net_device_stats *netdev_stats)
6546 {
6547 #if BITS_PER_LONG == 64
6548         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6549         memcpy(stats64, netdev_stats, sizeof(*stats64));
6550 #else
6551         size_t i, n = sizeof(*stats64) / sizeof(u64);
6552         const unsigned long *src = (const unsigned long *)netdev_stats;
6553         u64 *dst = (u64 *)stats64;
6554
6555         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6556                      sizeof(*stats64) / sizeof(u64));
6557         for (i = 0; i < n; i++)
6558                 dst[i] = src[i];
6559 #endif
6560 }
6561 EXPORT_SYMBOL(netdev_stats_to_stats64);
6562
6563 /**
6564  *      dev_get_stats   - get network device statistics
6565  *      @dev: device to get statistics from
6566  *      @storage: place to store stats
6567  *
6568  *      Get network statistics from device. Return @storage.
6569  *      The device driver may provide its own method by setting
6570  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6571  *      otherwise the internal statistics structure is used.
6572  */
6573 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6574                                         struct rtnl_link_stats64 *storage)
6575 {
6576         const struct net_device_ops *ops = dev->netdev_ops;
6577
6578         if (ops->ndo_get_stats64) {
6579                 memset(storage, 0, sizeof(*storage));
6580                 ops->ndo_get_stats64(dev, storage);
6581         } else if (ops->ndo_get_stats) {
6582                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6583         } else {
6584                 netdev_stats_to_stats64(storage, &dev->stats);
6585         }
6586         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
6587         storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
6588         return storage;
6589 }
6590 EXPORT_SYMBOL(dev_get_stats);
6591
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue first; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT, whatever dev_ingress_queue() reports is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully initialised */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6609
/*
 * Placeholder ethtool operations. There is no initializer, so as a
 * static object all of its members are zero. alloc_netdev_mqs() installs
 * its address when the setup callback left dev->ethtool_ops unset, and
 * the address is compared against below.
 */
static const struct ethtool_ops default_ethtool_ops;

/*
 * Install @ops as the ethtool operations of @dev, but only if the device
 * still points at the placeholder above. Operations that were set
 * explicitly are left untouched.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6619
6620 void netdev_freemem(struct net_device *dev)
6621 {
6622         char *addr = (char *)dev - dev->padded;
6623
6624         kvfree(addr);
6625 }
6626
/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv:		size of private data to allocate space for
 *	@name:			device name format string
 *	@name_assign_type: 	origin of device name
 *	@setup:			callback to initialize device
 *	@txqs:			the number of TX subqueues to allocate
 *	@rxqs:			the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 *
 *	Returns the new device, or NULL when @txqs is zero, when @rxqs is
 *	zero and CONFIG_SYSFS is enabled, or when an allocation fails.
 *	A @name that does not fit into dev->name is a BUG.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	/* makes the strcpy() into dev->name further down safe */
	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	/* try the slab allocator quietly first, fall back to vmalloc */
	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	/* Align the device inside the allocation and remember the offset
	 * so that netdev_freemem() can recover the original pointer.
	 */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	/* caller-supplied initialisation runs before the queues exist */
	setup(dev);

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	/* fall back to the placeholder ops if setup() installed none */
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;
	return dev;

free_all:
	/* from here on free_netdev() performs the complete teardown */
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
6738
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 *
 *	A device that is still %NETREG_UNINITIALIZED has its memory
 *	released directly. Otherwise the device must be
 *	%NETREG_UNREGISTERED (anything else is a BUG); it is marked
 *	%NETREG_RELEASED and freed through put_device().
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
	kfree(dev->_rx);
#endif

	/* constant-true condition: no lock is asserted for this access */
	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* _safe iteration: entries are deleted while walking the list */
	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/*  Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
6782
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep. The expedited RCU grace period is used when the rtnl
 *	lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6798
6799 /**
6800  *      unregister_netdevice_queue - remove device from the kernel
6801  *      @dev: device
6802  *      @head: list
6803  *
6804  *      This function shuts down a device interface and removes it
6805  *      from the kernel tables.
6806  *      If head not NULL, device is queued to be unregistered later.
6807  *
6808  *      Callers must hold the rtnl semaphore.  You may want
6809  *      unregister_netdev() instead of this.
6810  */
6811
6812 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6813 {
6814         ASSERT_RTNL();
6815
6816         if (head) {
6817                 list_move_tail(&dev->unreg_list, head);
6818         } else {
6819                 rollback_registered(dev);
6820                 /* Finish processing unregister after unlock */
6821                 net_set_todo(dev);
6822         }
6823 }
6824 EXPORT_SYMBOL(unregister_netdevice_queue);
6825
6826 /**
6827  *      unregister_netdevice_many - unregister many devices
6828  *      @head: list of devices
6829  *
6830  *  Note: As most callers use a stack allocated list_head,
6831  *  we force a list_del() to make sure stack wont be corrupted later.
6832  */
6833 void unregister_netdevice_many(struct list_head *head)
6834 {
6835         struct net_device *dev;
6836
6837         if (!list_empty(head)) {
6838                 rollback_registered_many(head);
6839                 list_for_each_entry(dev, head, unreg_list)
6840                         net_set_todo(dev);
6841                 list_del(head);
6842         }
6843 }
6844 EXPORT_SYMBOL(unregister_netdevice_many);
6845
6846 /**
6847  *      unregister_netdev - remove device from the kernel
6848  *      @dev: device
6849  *
6850  *      This function shuts down a device interface and removes it
6851  *      from the kernel tables.
6852  *
6853  *      This is just a wrapper for unregister_netdevice that takes
6854  *      the rtnl semaphore.  In general you want to use this and not
6855  *      unregister_netdevice.
6856  */
6857 void unregister_netdev(struct net_device *dev)
6858 {
6859         rtnl_lock();
6860         unregister_netdevice(dev);
6861         rtnl_unlock();
6862 }
6863 EXPORT_SYMBOL(unregister_netdev);
6864
6865 /**
6866  *      dev_change_net_namespace - move device to different nethost namespace
6867  *      @dev: device
6868  *      @net: network namespace
6869  *      @pat: If not NULL name pattern to try if the current device name
6870  *            is already taken in the destination network namespace.
6871  *
6872  *      This function shuts down a device interface and moves it
6873  *      to a new network namespace. On success 0 is returned, on
6874  *      a failure a netagive errno code is returned.
6875  *
6876  *      Callers must hold the rtnl semaphore.
6877  */
6878
6879 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6880 {
6881         int err;
6882
6883         ASSERT_RTNL();
6884
6885         /* Don't allow namespace local devices to be moved. */
6886         err = -EINVAL;
6887         if (dev->features & NETIF_F_NETNS_LOCAL)
6888                 goto out;
6889
6890         /* Ensure the device has been registrered */
6891         if (dev->reg_state != NETREG_REGISTERED)
6892                 goto out;
6893
6894         /* Get out if there is nothing todo */
6895         err = 0;
6896         if (net_eq(dev_net(dev), net))
6897                 goto out;
6898
6899         /* Pick the destination device name, and ensure
6900          * we can use it in the destination network namespace.
6901          */
6902         err = -EEXIST;
6903         if (__dev_get_by_name(net, dev->name)) {
6904                 /* We get here if we can't use the current device name */
6905                 if (!pat)
6906                         goto out;
6907                 if (dev_get_valid_name(net, dev, pat) < 0)
6908                         goto out;
6909         }
6910
6911         /*
6912          * And now a mini version of register_netdevice unregister_netdevice.
6913          */
6914
6915         /* If device is running close it first. */
6916         dev_close(dev);
6917
6918         /* And unlink it from device chain */
6919         err = -ENODEV;
6920         unlist_netdevice(dev);
6921
6922         synchronize_net();
6923
6924         /* Shutdown queueing discipline. */
6925         dev_shutdown(dev);
6926
6927         /* Notify protocols, that we are about to destroy
6928            this device. They should clean all the things.
6929
6930            Note that dev->reg_state stays at NETREG_REGISTERED.
6931            This is wanted because this way 8021q and macvlan know
6932            the device is just moving and can keep their slaves up.
6933         */
6934         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6935         rcu_barrier();
6936         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6937         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6938
6939         /*
6940          *      Flush the unicast and multicast chains
6941          */
6942         dev_uc_flush(dev);
6943         dev_mc_flush(dev);
6944
6945         /* Send a netdev-removed uevent to the old namespace */
6946         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6947         netdev_adjacent_del_links(dev);
6948
6949         /* Actually switch the network namespace */
6950         dev_net_set(dev, net);
6951
6952         /* If there is an ifindex conflict assign a new one */
6953         if (__dev_get_by_index(net, dev->ifindex)) {
6954                 int iflink = (dev->iflink == dev->ifindex);
6955                 dev->ifindex = dev_new_index(net);
6956                 if (iflink)
6957                         dev->iflink = dev->ifindex;
6958         }
6959
6960         /* Send a netdev-add uevent to the new namespace */
6961         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6962         netdev_adjacent_add_links(dev);
6963
6964         /* Fixup kobjects */
6965         err = device_rename(&dev->dev, dev->name);
6966         WARN_ON(err);
6967
6968         /* Add the device back in the hashes */
6969         list_netdevice(dev);
6970
6971         /* Notify protocols, that a new device appeared. */
6972         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6973
6974         /*
6975          *      Prevent userspace races by waiting until the network
6976          *      device is fully setup before sending notifications.
6977          */
6978         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6979
6980         synchronize_net();
6981         err = 0;
6982 out:
6983         return err;
6984 }
6985 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6986
6987 static int dev_cpu_callback(struct notifier_block *nfb,
6988                             unsigned long action,
6989                             void *ocpu)
6990 {
6991         struct sk_buff **list_skb;
6992         struct sk_buff *skb;
6993         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6994         struct softnet_data *sd, *oldsd;
6995
6996         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6997                 return NOTIFY_OK;
6998
6999         local_irq_disable();
7000         cpu = smp_processor_id();
7001         sd = &per_cpu(softnet_data, cpu);
7002         oldsd = &per_cpu(softnet_data, oldcpu);
7003
7004         /* Find end of our completion_queue. */
7005         list_skb = &sd->completion_queue;
7006         while (*list_skb)
7007                 list_skb = &(*list_skb)->next;
7008         /* Append completion queue from offline CPU. */
7009         *list_skb = oldsd->completion_queue;
7010         oldsd->completion_queue = NULL;
7011
7012         /* Append output queue from offline CPU. */
7013         if (oldsd->output_queue) {
7014                 *sd->output_queue_tailp = oldsd->output_queue;
7015                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7016                 oldsd->output_queue = NULL;
7017                 oldsd->output_queue_tailp = &oldsd->output_queue;
7018         }
7019         /* Append NAPI poll list from offline CPU, with one exception :
7020          * process_backlog() must be called by cpu owning percpu backlog.
7021          * We properly handle process_queue & input_pkt_queue later.
7022          */
7023         while (!list_empty(&oldsd->poll_list)) {
7024                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7025                                                             struct napi_struct,
7026                                                             poll_list);
7027
7028                 list_del_init(&napi->poll_list);
7029                 if (napi->poll == process_backlog)
7030                         napi->state = 0;
7031                 else
7032                         ____napi_schedule(sd, napi);
7033         }
7034
7035         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7036         local_irq_enable();
7037
7038         /* Process offline CPU's input_pkt_queue */
7039         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7040                 netif_rx_internal(skb);
7041                 input_queue_head_incr(oldsd);
7042         }
7043         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7044                 netif_rx_internal(skb);
7045                 input_queue_head_incr(oldsd);
7046         }
7047
7048         return NOTIFY_OK;
7049 }
7050
7051
7052 /**
7053  *      netdev_increment_features - increment feature set by one
7054  *      @all: current feature set
7055  *      @one: new feature set
7056  *      @mask: mask feature set
7057  *
7058  *      Computes a new feature set after adding a device with feature set
7059  *      @one to the master device with current feature set @all.  Will not
7060  *      enable anything that is off in @mask. Returns the new feature set.
7061  */
7062 netdev_features_t netdev_increment_features(netdev_features_t all,
7063         netdev_features_t one, netdev_features_t mask)
7064 {
7065         if (mask & NETIF_F_GEN_CSUM)
7066                 mask |= NETIF_F_ALL_CSUM;
7067         mask |= NETIF_F_VLAN_CHALLENGED;
7068
7069         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7070         all &= one | ~NETIF_F_ALL_FOR_ALL;
7071
7072         /* If one device supports hw checksumming, set for all. */
7073         if (all & NETIF_F_GEN_CSUM)
7074                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7075
7076         return all;
7077 }
7078 EXPORT_SYMBOL(netdev_increment_features);
7079
7080 static struct hlist_head * __net_init netdev_create_hash(void)
7081 {
7082         int i;
7083         struct hlist_head *hash;
7084
7085         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7086         if (hash != NULL)
7087                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7088                         INIT_HLIST_HEAD(&hash[i]);
7089
7090         return hash;
7091 }
7092
7093 /* Initialize per network namespace state */
7094 static int __net_init netdev_init(struct net *net)
7095 {
7096         if (net != &init_net)
7097                 INIT_LIST_HEAD(&net->dev_base_head);
7098
7099         net->dev_name_head = netdev_create_hash();
7100         if (net->dev_name_head == NULL)
7101                 goto err_name;
7102
7103         net->dev_index_head = netdev_create_hash();
7104         if (net->dev_index_head == NULL)
7105                 goto err_idx;
7106
7107         return 0;
7108
7109 err_idx:
7110         kfree(net->dev_name_head);
7111 err_name:
7112         return -ENOMEM;
7113 }
7114
7115 /**
7116  *      netdev_drivername - network driver for the device
7117  *      @dev: network device
7118  *
7119  *      Determine network driver for device.
7120  */
7121 const char *netdev_drivername(const struct net_device *dev)
7122 {
7123         const struct device_driver *driver;
7124         const struct device *parent;
7125         const char *empty = "";
7126
7127         parent = dev->dev.parent;
7128         if (!parent)
7129                 return empty;
7130
7131         driver = parent->driver;
7132         if (driver && driver->name)
7133                 return driver->name;
7134         return empty;
7135 }
7136
7137 static void __netdev_printk(const char *level, const struct net_device *dev,
7138                             struct va_format *vaf)
7139 {
7140         if (dev && dev->dev.parent) {
7141                 dev_printk_emit(level[1] - '0',
7142                                 dev->dev.parent,
7143                                 "%s %s %s%s: %pV",
7144                                 dev_driver_string(dev->dev.parent),
7145                                 dev_name(dev->dev.parent),
7146                                 netdev_name(dev), netdev_reg_state(dev),
7147                                 vaf);
7148         } else if (dev) {
7149                 printk("%s%s%s: %pV",
7150                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7151         } else {
7152                 printk("%s(NULL net_device): %pV", level, vaf);
7153         }
7154 }
7155
7156 void netdev_printk(const char *level, const struct net_device *dev,
7157                    const char *format, ...)
7158 {
7159         struct va_format vaf;
7160         va_list args;
7161
7162         va_start(args, format);
7163
7164         vaf.fmt = format;
7165         vaf.va = &args;
7166
7167         __netdev_printk(level, dev, &vaf);
7168
7169         va_end(args);
7170 }
7171 EXPORT_SYMBOL(netdev_printk);
7172
7173 #define define_netdev_printk_level(func, level)                 \
7174 void func(const struct net_device *dev, const char *fmt, ...)   \
7175 {                                                               \
7176         struct va_format vaf;                                   \
7177         va_list args;                                           \
7178                                                                 \
7179         va_start(args, fmt);                                    \
7180                                                                 \
7181         vaf.fmt = fmt;                                          \
7182         vaf.va = &args;                                         \
7183                                                                 \
7184         __netdev_printk(level, dev, &vaf);                      \
7185                                                                 \
7186         va_end(args);                                           \
7187 }                                                               \
7188 EXPORT_SYMBOL(func);
7189
7190 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7191 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7192 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7193 define_netdev_printk_level(netdev_err, KERN_ERR);
7194 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7195 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7196 define_netdev_printk_level(netdev_info, KERN_INFO);
7197
7198 static void __net_exit netdev_exit(struct net *net)
7199 {
7200         kfree(net->dev_name_head);
7201         kfree(net->dev_index_head);
7202 }
7203
7204 static struct pernet_operations __net_initdata netdev_net_ops = {
7205         .init = netdev_init,
7206         .exit = netdev_exit,
7207 };
7208
7209 static void __net_exit default_device_exit(struct net *net)
7210 {
7211         struct net_device *dev, *aux;
7212         /*
7213          * Push all migratable network devices back to the
7214          * initial network namespace
7215          */
7216         rtnl_lock();
7217         for_each_netdev_safe(net, dev, aux) {
7218                 int err;
7219                 char fb_name[IFNAMSIZ];
7220
7221                 /* Ignore unmoveable devices (i.e. loopback) */
7222                 if (dev->features & NETIF_F_NETNS_LOCAL)
7223                         continue;
7224
7225                 /* Leave virtual devices for the generic cleanup */
7226                 if (dev->rtnl_link_ops)
7227                         continue;
7228
7229                 /* Push remaining network devices to init_net */
7230                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7231                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7232                 if (err) {
7233                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7234                                  __func__, dev->name, err);
7235                         BUG();
7236                 }
7237         }
7238         rtnl_unlock();
7239 }
7240
7241 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7242 {
7243         /* Return with the rtnl_lock held when there are no network
7244          * devices unregistering in any network namespace in net_list.
7245          */
7246         struct net *net;
7247         bool unregistering;
7248         DEFINE_WAIT(wait);
7249
7250         for (;;) {
7251                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7252                                 TASK_UNINTERRUPTIBLE);
7253                 unregistering = false;
7254                 rtnl_lock();
7255                 list_for_each_entry(net, net_list, exit_list) {
7256                         if (net->dev_unreg_count > 0) {
7257                                 unregistering = true;
7258                                 break;
7259                         }
7260                 }
7261                 if (!unregistering)
7262                         break;
7263                 __rtnl_unlock();
7264                 schedule();
7265         }
7266         finish_wait(&netdev_unregistering_wq, &wait);
7267 }
7268
7269 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7270 {
7271         /* At exit all network devices most be removed from a network
7272          * namespace.  Do this in the reverse order of registration.
7273          * Do this across as many network namespaces as possible to
7274          * improve batching efficiency.
7275          */
7276         struct net_device *dev;
7277         struct net *net;
7278         LIST_HEAD(dev_kill_list);
7279
7280         /* To prevent network device cleanup code from dereferencing
7281          * loopback devices or network devices that have been freed
7282          * wait here for all pending unregistrations to complete,
7283          * before unregistring the loopback device and allowing the
7284          * network namespace be freed.
7285          *
7286          * The netdev todo list containing all network devices
7287          * unregistrations that happen in default_device_exit_batch
7288          * will run in the rtnl_unlock() at the end of
7289          * default_device_exit_batch.
7290          */
7291         rtnl_lock_unregistering(net_list);
7292         list_for_each_entry(net, net_list, exit_list) {
7293                 for_each_netdev_reverse(net, dev) {
7294                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7295                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7296                         else
7297                                 unregister_netdevice_queue(dev, &dev_kill_list);
7298                 }
7299         }
7300         unregister_netdevice_many(&dev_kill_list);
7301         rtnl_unlock();
7302 }
7303
7304 static struct pernet_operations __net_initdata default_device_ops = {
7305         .exit = default_device_exit,
7306         .exit_batch = default_device_exit_batch,
7307 };
7308
7309 /*
7310  *      Initialize the DEV module. At boot time this walks the device list and
7311  *      unhooks any devices that fail to initialise (normally hardware not
7312  *      present) and leaves us with a valid list of present and active devices.
7313  *
7314  */
7315
7316 /*
7317  *       This is called single threaded during boot, so no need
7318  *       to take the rtnl semaphore.
7319  */
7320 static int __init net_dev_init(void)
7321 {
7322         int i, rc = -ENOMEM;
7323
7324         BUG_ON(!dev_boot_phase);
7325
7326         if (dev_proc_init())
7327                 goto out;
7328
7329         if (netdev_kobject_init())
7330                 goto out;
7331
7332         INIT_LIST_HEAD(&ptype_all);
7333         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7334                 INIT_LIST_HEAD(&ptype_base[i]);
7335
7336         INIT_LIST_HEAD(&offload_base);
7337
7338         if (register_pernet_subsys(&netdev_net_ops))
7339                 goto out;
7340
7341         /*
7342          *      Initialise the packet receive queues.
7343          */
7344
7345         for_each_possible_cpu(i) {
7346                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7347
7348                 skb_queue_head_init(&sd->input_pkt_queue);
7349                 skb_queue_head_init(&sd->process_queue);
7350                 INIT_LIST_HEAD(&sd->poll_list);
7351                 sd->output_queue_tailp = &sd->output_queue;
7352 #ifdef CONFIG_RPS
7353                 sd->csd.func = rps_trigger_softirq;
7354                 sd->csd.info = sd;
7355                 sd->cpu = i;
7356 #endif
7357
7358                 sd->backlog.poll = process_backlog;
7359                 sd->backlog.weight = weight_p;
7360         }
7361
7362         dev_boot_phase = 0;
7363
7364         /* The loopback device is special if any other network devices
7365          * is present in a network namespace the loopback device must
7366          * be present. Since we now dynamically allocate and free the
7367          * loopback device ensure this invariant is maintained by
7368          * keeping the loopback device as the first device on the
7369          * list of network devices.  Ensuring the loopback devices
7370          * is the first device that appears and the last network device
7371          * that disappears.
7372          */
7373         if (register_pernet_device(&loopback_net_ops))
7374                 goto out;
7375
7376         if (register_pernet_device(&default_device_ops))
7377                 goto out;
7378
7379         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7380         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7381
7382         hotcpu_notifier(dev_cpu_callback, 0);
7383         dst_init();
7384         rc = 0;
7385 out:
7386         return rc;
7387 }
7388
7389 subsys_initcall(net_dev_init);