net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 static DEFINE_SPINLOCK(ptype_lock);
 145 static DEFINE_SPINLOCK(offload_lock);
 146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147 struct list_head ptype_all __read_mostly;       /* Taps */
 148 static struct list_head offload_base __read_mostly;
 149
 150 static int netif_rx_internal(struct sk_buff *skb);
 151 static int call_netdevice_notifiers_info(unsigned long val,
 152                                          struct net_device *dev,
 153                                          struct netdev_notifier_info *info);
 154
 155 /*
 156  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 157  * semaphore.
 158  *
 159  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 160  *
 161  * Writers must hold the rtnl semaphore while they loop through the
 162  * dev_base_head list, and hold dev_base_lock for writing when they do the
 163  * actual updates.  This allows pure readers to access the list even
 164  * while a writer is preparing to update it.
 165  *
 166  * To put it another way, dev_base_lock is held for writing only to
 167  * protect against pure readers; the rtnl semaphore provides the
 168  * protection against other writers.
 169  *
 170  * See, for example usages, register_netdevice() and
 171  * unregister_netdevice(), which must be called with the rtnl
 172  * semaphore held.
 173  */
 174 DEFINE_RWLOCK(dev_base_lock);
 175 EXPORT_SYMBOL(dev_base_lock);
 176
 177 /* protects napi_hash addition/deletion and napi_gen_id */
 178 static DEFINE_SPINLOCK(napi_hash_lock);
 179
 180 static unsigned int napi_gen_id;
 181 static DEFINE_HASHTABLE(napi_hash, 8);
 182
 183 static seqcount_t devnet_rename_seq;
 184
 185 static inline void dev_base_seq_inc(struct net *net)
 186 {
 187         while (++net->dev_base_seq == 0);
 188 }
 189
 190 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 191 {
 192         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 193
 194         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 195 }
 196
 197 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 198 {
 199         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 200 }
 201
 202 static inline void rps_lock(struct softnet_data *sd)
 203 {
 204 #ifdef CONFIG_RPS
 205         spin_lock(&sd->input_pkt_queue.lock);
 206 #endif
 207 }
 208
 209 static inline void rps_unlock(struct softnet_data *sd)
 210 {
 211 #ifdef CONFIG_RPS
 212         spin_unlock(&sd->input_pkt_queue.lock);
 213 #endif
 214 }
 215
 216 /* Device list insertion */
 217 static void list_netdevice(struct net_device *dev)
 218 {
 219         struct net *net = dev_net(dev);
 220
 221         ASSERT_RTNL();
 222
 223         write_lock_bh(&dev_base_lock);
 224         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 225         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 226         hlist_add_head_rcu(&dev->index_hlist,
 227                            dev_index_hash(net, dev->ifindex));
 228         write_unlock_bh(&dev_base_lock);
 229
 230         dev_base_seq_inc(net);
 231 }
 232
 233 /* Device list removal
 234  * caller must respect a RCU grace period before freeing/reusing dev
 235  */
 236 static void unlist_netdevice(struct net_device *dev)
 237 {
 238         ASSERT_RTNL();
 239
 240         /* Unlink dev from the device chain */
 241         write_lock_bh(&dev_base_lock);
 242         list_del_rcu(&dev->dev_list);
 243         hlist_del_rcu(&dev->name_hlist);
 244         hlist_del_rcu(&dev->index_hlist);
 245         write_unlock_bh(&dev_base_lock);
 246
 247         dev_base_seq_inc(dev_net(dev));
 248 }
 249
 250 /*
 251  *      Our notifier list
 252  */
 253
 254 static RAW_NOTIFIER_HEAD(netdev_chain);
 255
 256 /*
 257  *      Device drivers call our routines to queue packets here. We empty the
 258  *      queue in the local softnet handler.
 259  */
 260
 261 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 262 EXPORT_PER_CPU_SYMBOL(softnet_data);
 263
 264 #ifdef CONFIG_LOCKDEP
 265 /*
 266  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 267  * according to dev->type
 268  */
 269 static const unsigned short netdev_lock_type[] =
 270         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 271          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 272          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 273          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 274          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 275          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 276          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 277          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 278          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 279          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 280          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 281          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 282          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 283          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 284          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 285
 286 static const char *const netdev_lock_name[] =
 287         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 288          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 289          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 290          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 291          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 292          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 293          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 294          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 295          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 296          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 297          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 298          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 299          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 300          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 301          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 302
 303 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 304 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 305
 306 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 307 {
 308         int i;
 309
 310         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 311                 if (netdev_lock_type[i] == dev_type)
 312                         return i;
 313         /* the last key is used by default */
 314         return ARRAY_SIZE(netdev_lock_type) - 1;
 315 }
 316
 317 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 318                                                  unsigned short dev_type)
 319 {
 320         int i;
 321
 322         i = netdev_lock_pos(dev_type);
 323         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329         int i;
 330
 331         i = netdev_lock_pos(dev->type);
 332         lockdep_set_class_and_name(&dev->addr_list_lock,
 333                                    &netdev_addr_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336 #else
 337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 338                                                  unsigned short dev_type)
 339 {
 340 }
 341 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 342 {
 343 }
 344 #endif
 345
 346 /*******************************************************************************
 347
 348                 Protocol management and registration routines
 349
 350 *******************************************************************************/
 351
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets MUST be
 *	last in their hash buckets, and the scan for protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	This holds today; do not change it.
 *	Explanation: if a handler that mangles packets were first on the
 *	list, it could not tell that the packet is cloned and has to be
 *	copied on write, so it would modify the packet in place and
 *	subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 367
 368 static inline struct list_head *ptype_head(const struct packet_type *pt)
 369 {
 370         if (pt->type == htons(ETH_P_ALL))
 371                 return &ptype_all;
 372         else
 373                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 374 }
 375
 376 /**
 377  *      dev_add_pack - add packet handler
 378  *      @pt: packet type declaration
 379  *
 380  *      Add a protocol handler to the networking stack. The passed &packet_type
 381  *      is linked into kernel lists and may not be freed until it has been
 382  *      removed from the kernel lists.
 383  *
 384  *      This call does not sleep therefore it can not
 385  *      guarantee all CPU's that are in middle of receiving packets
 386  *      will see the new packet type (until the next received packet).
 387  */
 388
 389 void dev_add_pack(struct packet_type *pt)
 390 {
 391         struct list_head *head = ptype_head(pt);
 392
 393         spin_lock(&ptype_lock);
 394         list_add_rcu(&pt->list, head);
 395         spin_unlock(&ptype_lock);
 396 }
 397 EXPORT_SYMBOL(dev_add_pack);
 398
 399 /**
 400  *      __dev_remove_pack        - remove packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Remove a protocol handler that was previously added to the kernel
 404  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 405  *      from the kernel lists and can be freed or reused once this function
 406  *      returns.
 407  *
 408  *      The packet type might still be in use by receivers
 409  *      and must not be freed until after all the CPU's have gone
 410  *      through a quiescent state.
 411  */
 412 void __dev_remove_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415         struct packet_type *pt1;
 416
 417         spin_lock(&ptype_lock);
 418
 419         list_for_each_entry(pt1, head, list) {
 420                 if (pt == pt1) {
 421                         list_del_rcu(&pt->list);
 422                         goto out;
 423                 }
 424         }
 425
 426         pr_warn("dev_remove_pack: %p not found\n", pt);
 427 out:
 428         spin_unlock(&ptype_lock);
 429 }
 430 EXPORT_SYMBOL(__dev_remove_pack);
 431
 432 /**
 433  *      dev_remove_pack  - remove packet handler
 434  *      @pt: packet type declaration
 435  *
 436  *      Remove a protocol handler that was previously added to the kernel
 437  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 438  *      from the kernel lists and can be freed or reused once this function
 439  *      returns.
 440  *
 441  *      This call sleeps to guarantee that no CPU is looking at the packet
 442  *      type after return.
 443  */
 444 void dev_remove_pack(struct packet_type *pt)
 445 {
 446         __dev_remove_pack(pt);
 447
 448         synchronize_net();
 449 }
 450 EXPORT_SYMBOL(dev_remove_pack);
 451
 452
 453 /**
 454  *      dev_add_offload - register offload handlers
 455  *      @po: protocol offload declaration
 456  *
 457  *      Add protocol offload handlers to the networking stack. The passed
 458  *      &proto_offload is linked into kernel lists and may not be freed until
 459  *      it has been removed from the kernel lists.
 460  *
 461  *      This call does not sleep therefore it can not
 462  *      guarantee all CPU's that are in middle of receiving packets
 463  *      will see the new offload handlers (until the next received packet).
 464  */
 465 void dev_add_offload(struct packet_offload *po)
 466 {
 467         struct list_head *head = &offload_base;
 468
 469         spin_lock(&offload_lock);
 470         list_add_rcu(&po->list, head);
 471         spin_unlock(&offload_lock);
 472 }
 473 EXPORT_SYMBOL(dev_add_offload);
 474
 475 /**
 476  *      __dev_remove_offload     - remove offload handler
 477  *      @po: packet offload declaration
 478  *
 479  *      Remove a protocol offload handler that was previously added to the
 480  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 481  *      is removed from the kernel lists and can be freed or reused once this
 482  *      function returns.
 483  *
 484  *      The packet type might still be in use by receivers
 485  *      and must not be freed until after all the CPU's have gone
 486  *      through a quiescent state.
 487  */
 488 static void __dev_remove_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491         struct packet_offload *po1;
 492
 493         spin_lock(&offload_lock);
 494
 495         list_for_each_entry(po1, head, list) {
 496                 if (po == po1) {
 497                         list_del_rcu(&po->list);
 498                         goto out;
 499                 }
 500         }
 501
 502         pr_warn("dev_remove_offload: %p not found\n", po);
 503 out:
 504         spin_unlock(&offload_lock);
 505 }
 506
 507 /**
 508  *      dev_remove_offload       - remove packet offload handler
 509  *      @po: packet offload declaration
 510  *
 511  *      Remove a packet offload handler that was previously added to the kernel
 512  *      offload handlers by dev_add_offload(). The passed &offload_type is
 513  *      removed from the kernel lists and can be freed or reused once this
 514  *      function returns.
 515  *
 516  *      This call sleeps to guarantee that no CPU is looking at the packet
 517  *      type after return.
 518  */
 519 void dev_remove_offload(struct packet_offload *po)
 520 {
 521         __dev_remove_offload(po);
 522
 523         synchronize_net();
 524 }
 525 EXPORT_SYMBOL(dev_remove_offload);
 526
 527 /******************************************************************************
 528
 529                       Device Boot-time Settings Routines
 530
 531 *******************************************************************************/
 532
 533 /* Boot time configuration table */
 534 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 535
 536 /**
 537  *      netdev_boot_setup_add   - add new setup entry
 538  *      @name: name of the device
 539  *      @map: configured settings for the device
 540  *
 541  *      Adds new setup entry to the dev_boot_setup list.  The function
 542  *      returns 0 on error and 1 on success.  This is a generic routine to
 543  *      all netdevices.
 544  */
 545 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 546 {
 547         struct netdev_boot_setup *s;
 548         int i;
 549
 550         s = dev_boot_setup;
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 552                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 553                         memset(s[i].name, 0, sizeof(s[i].name));
 554                         strlcpy(s[i].name, name, IFNAMSIZ);
 555                         memcpy(&s[i].map, map, sizeof(s[i].map));
 556                         break;
 557                 }
 558         }
 559
 560         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 561 }
 562
 563 /**
 564  *      netdev_boot_setup_check - check boot time settings
 565  *      @dev: the netdevice
 566  *
 567  *      Check boot time settings for the device.
 568  *      The found settings are set for the device to be used
 569  *      later in the device probing.
 570  *      Returns 0 if no settings found, 1 if they are.
 571  */
 572 int netdev_boot_setup_check(struct net_device *dev)
 573 {
 574         struct netdev_boot_setup *s = dev_boot_setup;
 575         int i;
 576
 577         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 578                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 579                     !strcmp(dev->name, s[i].name)) {
 580                         dev->irq        = s[i].map.irq;
 581                         dev->base_addr  = s[i].map.base_addr;
 582                         dev->mem_start  = s[i].map.mem_start;
 583                         dev->mem_end    = s[i].map.mem_end;
 584                         return 1;
 585                 }
 586         }
 587         return 0;
 588 }
 589 EXPORT_SYMBOL(netdev_boot_setup_check);
 590
 591
 592 /**
 593  *      netdev_boot_base        - get address from boot time settings
 594  *      @prefix: prefix for network device
 595  *      @unit: id for network device
 596  *
 597  *      Check boot time settings for the base address of device.
 598  *      The found settings are set for the device to be used
 599  *      later in the device probing.
 600  *      Returns 0 if no settings found.
 601  */
 602 unsigned long netdev_boot_base(const char *prefix, int unit)
 603 {
 604         const struct netdev_boot_setup *s = dev_boot_setup;
 605         char name[IFNAMSIZ];
 606         int i;
 607
 608         sprintf(name, "%s%d", prefix, unit);
 609
 610         /*
 611          * If device already registered then return base of 1
 612          * to indicate not to probe for this interface
 613          */
 614         if (__dev_get_by_name(&init_net, name))
 615                 return 1;
 616
 617         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 618                 if (!strcmp(name, s[i].name))
 619                         return s[i].map.base_addr;
 620         return 0;
 621 }
 622
 623 /*
 624  * Saves at boot time configured settings for any netdevice.
 625  */
 626 int __init netdev_boot_setup(char *str)
 627 {
 628         int ints[5];
 629         struct ifmap map;
 630
 631         str = get_options(str, ARRAY_SIZE(ints), ints);
 632         if (!str || !*str)
 633                 return 0;
 634
 635         /* Save settings */
 636         memset(&map, 0, sizeof(map));
 637         if (ints[0] > 0)
 638                 map.irq = ints[1];
 639         if (ints[0] > 1)
 640                 map.base_addr = ints[2];
 641         if (ints[0] > 2)
 642                 map.mem_start = ints[3];
 643         if (ints[0] > 3)
 644                 map.mem_end = ints[4];
 645
 646         /* Add new entry to the list */
 647         return netdev_boot_setup_add(str, &map);
 648 }
 649
 650 __setup("netdev=", netdev_boot_setup);
 651
 652 /*******************************************************************************
 653
 654                             Device Interface Subroutines
 655
 656 *******************************************************************************/
 657
 658 /**
 659  *      __dev_get_by_name       - find a device by its name
 660  *      @net: the applicable net namespace
 661  *      @name: name to find
 662  *
 663  *      Find an interface by name. Must be called under RTNL semaphore
 664  *      or @dev_base_lock. If the name is found a pointer to the device
 665  *      is returned. If the name is not found then %NULL is returned. The
 666  *      reference counters are not incremented so the caller must be
 667  *      careful with locks.
 668  */
 669
 670 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 671 {
 672         struct net_device *dev;
 673         struct hlist_head *head = dev_name_hash(net, name);
 674
 675         hlist_for_each_entry(dev, head, name_hlist)
 676                 if (!strncmp(dev->name, name, IFNAMSIZ))
 677                         return dev;
 678
 679         return NULL;
 680 }
 681 EXPORT_SYMBOL(__dev_get_by_name);
 682
 683 /**
 684  *      dev_get_by_name_rcu     - find a device by its name
 685  *      @net: the applicable net namespace
 686  *      @name: name to find
 687  *
 688  *      Find an interface by name.
 689  *      If the name is found a pointer to the device is returned.
 690  *      If the name is not found then %NULL is returned.
 691  *      The reference counters are not incremented so the caller must be
 692  *      careful with locks. The caller must hold RCU lock.
 693  */
 694
 695 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 696 {
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry_rcu(dev, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(dev_get_by_name_rcu);
 707
 708 /**
 709  *      dev_get_by_name         - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name. This can be called from any
 714  *      context and does its own locking. The returned handle has
 715  *      the usage count incremented and the caller must use dev_put() to
 716  *      release it when it is no longer needed. %NULL is returned if no
 717  *      matching device is found.
 718  */
 719
 720 struct net_device *dev_get_by_name(struct net *net, const char *name)
 721 {
 722         struct net_device *dev;
 723
 724         rcu_read_lock();
 725         dev = dev_get_by_name_rcu(net, name);
 726         if (dev)
 727                 dev_hold(dev);
 728         rcu_read_unlock();
 729         return dev;
 730 }
 731 EXPORT_SYMBOL(dev_get_by_name);
 732
 733 /**
 734  *      __dev_get_by_index - find a device by its ifindex
 735  *      @net: the applicable net namespace
 736  *      @ifindex: index of device
 737  *
 738  *      Search for an interface by index. Returns %NULL if the device
 739  *      is not found or a pointer to the device. The device has not
 740  *      had its reference counter increased so the caller must be careful
 741  *      about locking. The caller must hold either the RTNL semaphore
 742  *      or @dev_base_lock.
 743  */
 744
 745 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 746 {
 747         struct net_device *dev;
 748         struct hlist_head *head = dev_index_hash(net, ifindex);
 749
 750         hlist_for_each_entry(dev, head, index_hlist)
 751                 if (dev->ifindex == ifindex)
 752                         return dev;
 753
 754         return NULL;
 755 }
 756 EXPORT_SYMBOL(__dev_get_by_index);
 757
 758 /**
 759  *      dev_get_by_index_rcu - find a device by its ifindex
 760  *      @net: the applicable net namespace
 761  *      @ifindex: index of device
 762  *
 763  *      Search for an interface by index. Returns %NULL if the device
 764  *      is not found or a pointer to the device. The device has not
 765  *      had its reference counter increased so the caller must be careful
 766  *      about locking. The caller must hold RCU lock.
 767  */
 768
 769 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 770 {
 771         struct net_device *dev;
 772         struct hlist_head *head = dev_index_hash(net, ifindex);
 773
 774         hlist_for_each_entry_rcu(dev, head, index_hlist)
 775                 if (dev->ifindex == ifindex)
 776                         return dev;
 777
 778         return NULL;
 779 }
 780 EXPORT_SYMBOL(dev_get_by_index_rcu);
 781
 782
 783 /**
 784  *      dev_get_by_index - find a device by its ifindex
 785  *      @net: the applicable net namespace
 786  *      @ifindex: index of device
 787  *
 788  *      Search for an interface by index. Returns NULL if the device
 789  *      is not found or a pointer to the device. The device returned has
 790  *      had a reference added and the pointer is safe until the user calls
 791  *      dev_put to indicate they have finished with it.
 792  */
 793
 794 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 795 {
 796         struct net_device *dev;
 797
 798         rcu_read_lock();
 799         dev = dev_get_by_index_rcu(net, ifindex);
 800         if (dev)
 801                 dev_hold(dev);
 802         rcu_read_unlock();
 803         return dev;
 804 }
 805 EXPORT_SYMBOL(dev_get_by_index);
 806
 807 /**
 808  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 809  *      @net: network namespace
 810  *      @name: a pointer to the buffer where the name will be stored.
 811  *      @ifindex: the ifindex of the interface to get the name from.
 812  *
 813  *      The use of raw_seqcount_begin() and cond_resched() before
 814  *      retrying is required as we want to give the writers a chance
 815  *      to complete when CONFIG_PREEMPT is not set.
 816  */
 817 int netdev_get_name(struct net *net, char *name, int ifindex)
 818 {
 819         struct net_device *dev;
 820         unsigned int seq;
 821
 822 retry:
 823         seq = raw_seqcount_begin(&devnet_rename_seq);
 824         rcu_read_lock();
 825         dev = dev_get_by_index_rcu(net, ifindex);
 826         if (!dev) {
 827                 rcu_read_unlock();
 828                 return -ENODEV;
 829         }
 830
 831         strcpy(name, dev->name);
 832         rcu_read_unlock();
 833         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 834                 cond_resched();
 835                 goto retry;
 836         }
 837
 838         return 0;
 839 }
 840
 841 /**
 842  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 843  *      @net: the applicable net namespace
 844  *      @type: media type of device
 845  *      @ha: hardware address
 846  *
 847  *      Search for an interface by MAC address. Returns NULL if the device
 848  *      is not found or a pointer to the device.
 849  *      The caller must hold RCU or RTNL.
 850  *      The returned device has not had its ref count increased
 851  *      and the caller must therefore be careful about locking
 852  *
 853  */
 854
 855 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 856                                        const char *ha)
 857 {
 858         struct net_device *dev;
 859
 860         for_each_netdev_rcu(net, dev)
 861                 if (dev->type == type &&
 862                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 863                         return dev;
 864
 865         return NULL;
 866 }
 867 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 868
 869 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 870 {
 871         struct net_device *dev;
 872
 873         ASSERT_RTNL();
 874         for_each_netdev(net, dev)
 875                 if (dev->type == type)
 876                         return dev;
 877
 878         return NULL;
 879 }
 880 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 881
 882 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 883 {
 884         struct net_device *dev, *ret = NULL;
 885
 886         rcu_read_lock();
 887         for_each_netdev_rcu(net, dev)
 888                 if (dev->type == type) {
 889                         dev_hold(dev);
 890                         ret = dev;
 891                         break;
 892                 }
 893         rcu_read_unlock();
 894         return ret;
 895 }
 896 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 897
 898 /**
 899  *      dev_get_by_flags_rcu - find any device with given flags
 900  *      @net: the applicable net namespace
 901  *      @if_flags: IFF_* values
 902  *      @mask: bitmask of bits in if_flags to check
 903  *
 904  *      Search for any interface with the given flags. Returns NULL if a device
 905  *      is not found or a pointer to the device. Must be called inside
 906  *      rcu_read_lock(), and result refcount is unchanged.
 907  */
 908
 909 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 910                                     unsigned short mask)
 911 {
 912         struct net_device *dev, *ret;
 913
 914         ret = NULL;
 915         for_each_netdev_rcu(net, dev) {
 916                 if (((dev->flags ^ if_flags) & mask) == 0) {
 917                         ret = dev;
 918                         break;
 919                 }
 920         }
 921         return ret;
 922 }
 923 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 924
 925 /**
 926  *      dev_valid_name - check if name is okay for network device
 927  *      @name: name string
 928  *
 929  *      Network device names need to be valid file names to
 930  *      to allow sysfs to work.  We also disallow any kind of
 931  *      whitespace.
 932  */
 933 bool dev_valid_name(const char *name)
 934 {
 935         if (*name == '\0')
 936                 return false;
 937         if (strlen(name) >= IFNAMSIZ)
 938                 return false;
 939         if (!strcmp(name, ".") || !strcmp(name, ".."))
 940                 return false;
 941
 942         while (*name) {
 943                 if (*name == '/' || *name == ':' || isspace(*name))
 944                         return false;
 945                 name++;
 946         }
 947         return true;
 948 }
 949 EXPORT_SYMBOL(dev_valid_name);
 950
 951 /**
 952  *      __dev_alloc_name - allocate a name for a device
 953  *      @net: network namespace to allocate the device name in
 954  *      @name: name format string
 955  *      @buf:  scratch buffer and result name string
 956  *
 957  *      Passed a format string - eg "lt%d" it will try and find a suitable
 958  *      id. It scans list of devices to build up a free map, then chooses
 959  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 960  *      while allocating the name and adding the device in order to avoid
 961  *      duplicates.
 962  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 963  *      Returns the number of the unit assigned or a negative errno code.
 964  */
 965
 966 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 967 {
 968         int i = 0;
 969         const char *p;
 970         const int max_netdevices = 8*PAGE_SIZE;
 971         unsigned long *inuse;
 972         struct net_device *d;
 973
 974         p = strnchr(name, IFNAMSIZ-1, '%');
 975         if (p) {
 976                 /*
 977                  * Verify the string as this thing may have come from
 978                  * the user.  There must be either one "%d" and no other "%"
 979                  * characters.
 980                  */
 981                 if (p[1] != 'd' || strchr(p + 2, '%'))
 982                         return -EINVAL;
 983
 984                 /* Use one page as a bit array of possible slots */
 985                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 986                 if (!inuse)
 987                         return -ENOMEM;
 988
 989                 for_each_netdev(net, d) {
 990                         if (!sscanf(d->name, name, &i))
 991                                 continue;
 992                         if (i < 0 || i >= max_netdevices)
 993                                 continue;
 994
 995                         /*  avoid cases where sscanf is not exact inverse of printf */
 996                         snprintf(buf, IFNAMSIZ, name, i);
 997                         if (!strncmp(buf, d->name, IFNAMSIZ))
 998                                 set_bit(i, inuse);
 999                 }
1000
1001                 i = find_first_zero_bit(inuse, max_netdevices);
1002                 free_page((unsigned long) inuse);
1003         }
1004
1005         if (buf != name)
1006                 snprintf(buf, IFNAMSIZ, name, i);
1007         if (!__dev_get_by_name(net, buf))
1008                 return i;
1009
1010         /* It is possible to run out of possible slots
1011          * when the name is long and there isn't enough space left
1012          * for the digits, or if all bits are used.
1013          */
1014         return -ENFILE;
1015 }
1016
1017 /**
1018  *      dev_alloc_name - allocate a name for a device
1019  *      @dev: device
1020  *      @name: name format string
1021  *
1022  *      Passed a format string - eg "lt%d" it will try and find a suitable
1023  *      id. It scans list of devices to build up a free map, then chooses
1024  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1025  *      while allocating the name and adding the device in order to avoid
1026  *      duplicates.
1027  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1028  *      Returns the number of the unit assigned or a negative errno code.
1029  */
1030
1031 int dev_alloc_name(struct net_device *dev, const char *name)
1032 {
1033         char buf[IFNAMSIZ];
1034         struct net *net;
1035         int ret;
1036
1037         BUG_ON(!dev_net(dev));
1038         net = dev_net(dev);
1039         ret = __dev_alloc_name(net, name, buf);
1040         if (ret >= 0)
1041                 strlcpy(dev->name, buf, IFNAMSIZ);
1042         return ret;
1043 }
1044 EXPORT_SYMBOL(dev_alloc_name);
1045
1046 static int dev_alloc_name_ns(struct net *net,
1047                              struct net_device *dev,
1048                              const char *name)
1049 {
1050         char buf[IFNAMSIZ];
1051         int ret;
1052
1053         ret = __dev_alloc_name(net, name, buf);
1054         if (ret >= 0)
1055                 strlcpy(dev->name, buf, IFNAMSIZ);
1056         return ret;
1057 }
1058
1059 int dev_get_valid_name(struct net *net, struct net_device *dev,
1060                        const char *name)
1061 {
1062         BUG_ON(!net);
1063
1064         if (!dev_valid_name(name))
1065                 return -EINVAL;
1066
1067         if (strchr(name, '%'))
1068                 return dev_alloc_name_ns(net, dev, name);
1069         else if (__dev_get_by_name(net, name))
1070                 return -EEXIST;
1071         else if (dev->name != name)
1072                 strlcpy(dev->name, name, IFNAMSIZ);
1073
1074         return 0;
1075 }
1076 EXPORT_SYMBOL(dev_get_valid_name);
1077
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string); the buffer must be at least
 *                IFNAMSIZ bytes, since IFNAMSIZ bytes are copied from
 *                it on the rollback path
 *
 *      Change name of a device, can pass format strings "eth%d"
 *      for wildcarding. RTNL must be held (asserted).
 *
 *      Returns -EBUSY if the device is up and 0 if @newname equals the
 *      current name. On success returns the non-negative result of
 *      dev_get_valid_name(). Otherwise returns a negative errno from
 *      name validation, from device_rename(), or from a
 *      NETDEV_CHANGENAME notifier that rejected the change (in which
 *      case a rollback to the old name is attempted).
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /* Readers such as netdev_get_name() retry across this section */
        write_seqcount_begin(&devnet_rename_seq);

        /* Nothing to do when the name is unchanged */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so it can be restored on failure */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* Validates newname and writes the final name into dev->name */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        /*
         * Entered a second time, with the seqcount write section
         * re-opened, when a notifier rejects the change; dev->name then
         * holds the old name again.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /* Unhash under the old name ... */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        /* ... wait for an RCU grace period before re-adding ... */
        synchronize_rcu();

        /* ... then rehash under the name now stored in dev->name */
        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First failure: remember the errno, swap the old
                         * name back into dev->name and redo the rename.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        goto rollback;
                } else {
                        /* The rollback itself was rejected: only report it */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1156
1157 /**
1158  *      dev_set_alias - change ifalias of a device
1159  *      @dev: device
1160  *      @alias: name up to IFALIASZ
1161  *      @len: limit of bytes to copy from info
1162  *
1163  *      Set ifalias for a device,
1164  */
1165 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1166 {
1167         char *new_ifalias;
1168
1169         ASSERT_RTNL();
1170
1171         if (len >= IFALIASZ)
1172                 return -EINVAL;
1173
1174         if (!len) {
1175                 kfree(dev->ifalias);
1176                 dev->ifalias = NULL;
1177                 return 0;
1178         }
1179
1180         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1181         if (!new_ifalias)
1182                 return -ENOMEM;
1183         dev->ifalias = new_ifalias;
1184
1185         strlcpy(dev->ifalias, alias, len+1);
1186         return len;
1187 }
1188
1189
1190 /**
1191  *      netdev_features_change - device changes features
1192  *      @dev: device to cause notification
1193  *
1194  *      Called to indicate a device has changed features.
1195  */
1196 void netdev_features_change(struct net_device *dev)
1197 {
1198         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1199 }
1200 EXPORT_SYMBOL(netdev_features_change);
1201
1202 /**
1203  *      netdev_state_change - device changes state
1204  *      @dev: device to cause notification
1205  *
1206  *      Called to indicate a device has changed state. This function calls
1207  *      the notifier chains for netdev_chain and sends a NEWLINK message
1208  *      to the routing socket.
1209  */
1210 void netdev_state_change(struct net_device *dev)
1211 {
1212         if (dev->flags & IFF_UP) {
1213                 struct netdev_notifier_change_info change_info;
1214
1215                 change_info.flags_changed = 0;
1216                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1217                                               &change_info.info);
1218                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1219         }
1220 }
1221 EXPORT_SYMBOL(netdev_state_change);
1222
1223 /**
1224  *      netdev_notify_peers - notify network peers about existence of @dev
1225  *      @dev: network device
1226  *
1227  * Generate traffic such that interested network peers are aware of
1228  * @dev, such as by generating a gratuitous ARP. This may be used when
1229  * a device wants to inform the rest of the network about some sort of
1230  * reconfiguration such as a failover event or virtual machine
1231  * migration.
1232  */
1233 void netdev_notify_peers(struct net_device *dev)
1234 {
1235         rtnl_lock();
1236         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1237         rtnl_unlock();
1238 }
1239 EXPORT_SYMBOL(netdev_notify_peers);
1240
1241 static int __dev_open(struct net_device *dev)
1242 {
1243         const struct net_device_ops *ops = dev->netdev_ops;
1244         int ret;
1245
1246         ASSERT_RTNL();
1247
1248         if (!netif_device_present(dev))
1249                 return -ENODEV;
1250
1251         /* Block netpoll from trying to do any rx path servicing.
1252          * If we don't do this there is a chance ndo_poll_controller
1253          * or ndo_poll may be running while we open the device
1254          */
1255         netpoll_poll_disable(dev);
1256
1257         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1258         ret = notifier_to_errno(ret);
1259         if (ret)
1260                 return ret;
1261
1262         set_bit(__LINK_STATE_START, &dev->state);
1263
1264         if (ops->ndo_validate_addr)
1265                 ret = ops->ndo_validate_addr(dev);
1266
1267         if (!ret && ops->ndo_open)
1268                 ret = ops->ndo_open(dev);
1269
1270         netpoll_poll_enable(dev);
1271
1272         if (ret)
1273                 clear_bit(__LINK_STATE_START, &dev->state);
1274         else {
1275                 dev->flags |= IFF_UP;
1276                 net_dmaengine_get();
1277                 dev_set_rx_mode(dev);
1278                 dev_activate(dev);
1279                 add_device_randomness(dev->dev_addr, dev->addr_len);
1280         }
1281
1282         return ret;
1283 }
1284
1285 /**
1286  *      dev_open        - prepare an interface for use.
1287  *      @dev:   device to open
1288  *
1289  *      Takes a device from down to up state. The device's private open
1290  *      function is invoked and then the multicast lists are loaded. Finally
1291  *      the device is moved into the up state and a %NETDEV_UP message is
1292  *      sent to the netdev notifier chain.
1293  *
1294  *      Calling this function on an active interface is a nop. On a failure
1295  *      a negative errno code is returned.
1296  */
1297 int dev_open(struct net_device *dev)
1298 {
1299         int ret;
1300
1301         if (dev->flags & IFF_UP)
1302                 return 0;
1303
1304         ret = __dev_open(dev);
1305         if (ret < 0)
1306                 return ret;
1307
1308         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1309         call_netdevice_notifiers(NETDEV_UP, dev);
1310
1311         return ret;
1312 }
1313 EXPORT_SYMBOL(dev_open);
1314
1315 static int __dev_close_many(struct list_head *head)
1316 {
1317         struct net_device *dev;
1318
1319         ASSERT_RTNL();
1320         might_sleep();
1321
1322         list_for_each_entry(dev, head, close_list) {
1323                 /* Temporarily disable netpoll until the interface is down */
1324                 netpoll_poll_disable(dev);
1325
1326                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1327
1328                 clear_bit(__LINK_STATE_START, &dev->state);
1329
1330                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1331                  * can be even on different cpu. So just clear netif_running().
1332                  *
1333                  * dev->stop() will invoke napi_disable() on all of it's
1334                  * napi_struct instances on this device.
1335                  */
1336                 smp_mb__after_atomic(); /* Commit netif_running(). */
1337         }
1338
1339         dev_deactivate_many(head);
1340
1341         list_for_each_entry(dev, head, close_list) {
1342                 const struct net_device_ops *ops = dev->netdev_ops;
1343
1344                 /*
1345                  *      Call the device specific close. This cannot fail.
1346                  *      Only if device is UP
1347                  *
1348                  *      We allow it to be called even after a DETACH hot-plug
1349                  *      event.
1350                  */
1351                 if (ops->ndo_stop)
1352                         ops->ndo_stop(dev);
1353
1354                 dev->flags &= ~IFF_UP;
1355                 net_dmaengine_put();
1356                 netpoll_poll_enable(dev);
1357         }
1358
1359         return 0;
1360 }
1361
1362 static int __dev_close(struct net_device *dev)
1363 {
1364         int retval;
1365         LIST_HEAD(single);
1366
1367         list_add(&dev->close_list, &single);
1368         retval = __dev_close_many(&single);
1369         list_del(&single);
1370
1371         return retval;
1372 }
1373
1374 static int dev_close_many(struct list_head *head)
1375 {
1376         struct net_device *dev, *tmp;
1377
1378         /* Remove the devices that don't need to be closed */
1379         list_for_each_entry_safe(dev, tmp, head, close_list)
1380                 if (!(dev->flags & IFF_UP))
1381                         list_del_init(&dev->close_list);
1382
1383         __dev_close_many(head);
1384
1385         list_for_each_entry_safe(dev, tmp, head, close_list) {
1386                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1387                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1388                 list_del_init(&dev->close_list);
1389         }
1390
1391         return 0;
1392 }
1393
1394 /**
1395  *      dev_close - shutdown an interface.
1396  *      @dev: device to shutdown
1397  *
1398  *      This function moves an active device into down state. A
1399  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1400  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1401  *      chain.
1402  */
1403 int dev_close(struct net_device *dev)
1404 {
1405         if (dev->flags & IFF_UP) {
1406                 LIST_HEAD(single);
1407
1408                 list_add(&dev->close_list, &single);
1409                 dev_close_many(&single);
1410                 list_del(&single);
1411         }
1412         return 0;
1413 }
1414 EXPORT_SYMBOL(dev_close);
1415
1416
1417 /**
1418  *      dev_disable_lro - disable Large Receive Offload on a device
1419  *      @dev: device
1420  *
1421  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1422  *      called under RTNL.  This is needed if received packets may be
1423  *      forwarded to another interface.
1424  */
1425 void dev_disable_lro(struct net_device *dev)
1426 {
1427         /*
1428          * If we're trying to disable lro on a vlan device
1429          * use the underlying physical device instead
1430          */
1431         if (is_vlan_dev(dev))
1432                 dev = vlan_dev_real_dev(dev);
1433
1434         /* the same for macvlan devices */
1435         if (netif_is_macvlan(dev))
1436                 dev = macvlan_dev_real_dev(dev);
1437
1438         dev->wanted_features &= ~NETIF_F_LRO;
1439         netdev_update_features(dev);
1440
1441         if (unlikely(dev->features & NETIF_F_LRO))
1442                 netdev_WARN(dev, "failed to disable LRO!\n");
1443 }
1444 EXPORT_SYMBOL(dev_disable_lro);
1445
1446 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1447                                    struct net_device *dev)
1448 {
1449         struct netdev_notifier_info info;
1450
1451         netdev_notifier_info_init(&info, dev);
1452         return nb->notifier_call(nb, val, &info);
1453 }
1454
/*
 * Starts out non-zero. While it is set, register_netdevice_notifier()
 * skips replaying REGISTER/UP events to a newly added notifier.
 * NOTE(review): presumably cleared once core network device init has
 * finished -- the clearing site is outside this section; confirm.
 */
static int dev_boot_phase = 1;
1456
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered, all registration and up events are replayed
 *      to the new notifier to give it a race-free view of the network
 *      device list. If the notifier rejects a replayed NETDEV_REGISTER,
 *      the devices already replayed are rolled back with synthesized
 *      down/unregister events and the notifier is removed again.
 *
 *      Takes and releases the RTNL lock itself.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay while dev_boot_phase is still set */
        if (dev_boot_phase)
                goto unlock;
        /* Replay REGISTER (and UP for running devices) to the new notifier */
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /*
         * Undo the replay for every device visited before the one whose
         * NETDEV_REGISTER was rejected (that one, 'last', is excluded).
         */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        /* Drop the notifier from the chain; err still holds the failure */
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1523
1524 /**
1525  *      unregister_netdevice_notifier - unregister a network notifier block
1526  *      @nb: notifier
1527  *
1528  *      Unregister a notifier previously registered by
1529  *      register_netdevice_notifier(). The notifier is unlinked into the
1530  *      kernel structures and may then be reused. A negative errno code
1531  *      is returned on a failure.
1532  *
1533  *      After unregistering unregister and down device events are synthesized
1534  *      for all devices on the device list to the removed notifier to remove
1535  *      the need for special case cleanup code.
1536  */
1537
1538 int unregister_netdevice_notifier(struct notifier_block *nb)
1539 {
1540         struct net_device *dev;
1541         struct net *net;
1542         int err;
1543
1544         rtnl_lock();
1545         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1546         if (err)
1547                 goto unlock;
1548
1549         for_each_net(net) {
1550                 for_each_netdev(net, dev) {
1551                         if (dev->flags & IFF_UP) {
1552                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1553                                                         dev);
1554                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1555                         }
1556                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1557                 }
1558         }
1559 unlock:
1560         rtnl_unlock();
1561         return err;
1562 }
1563 EXPORT_SYMBOL(unregister_netdevice_notifier);
1564
1565 /**
1566  *      call_netdevice_notifiers_info - call all network notifier blocks
1567  *      @val: value passed unmodified to notifier function
1568  *      @dev: net_device pointer passed unmodified to notifier function
1569  *      @info: notifier information data
1570  *
1571  *      Call all network notifier blocks.  Parameters and return value
1572  *      are as for raw_notifier_call_chain().
1573  */
1574
1575 static int call_netdevice_notifiers_info(unsigned long val,
1576                                          struct net_device *dev,
1577                                          struct netdev_notifier_info *info)
1578 {
1579         ASSERT_RTNL();
1580         netdev_notifier_info_init(info, dev);
1581         return raw_notifier_call_chain(&netdev_chain, val, info);
1582 }
1583
1584 /**
1585  *      call_netdevice_notifiers - call all network notifier blocks
1586  *      @val: value passed unmodified to notifier function
1587  *      @dev: net_device pointer passed unmodified to notifier function
1588  *
1589  *      Call all network notifier blocks.  Parameters and return value
1590  *      are as for raw_notifier_call_chain().
1591  */
1592
1593 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1594 {
1595         struct netdev_notifier_info info;
1596
1597         return call_netdevice_notifiers_info(val, dev, &info);
1598 }
1599 EXPORT_SYMBOL(call_netdevice_notifiers);
1600
/* Static key tested on the packet paths to decide whether to timestamp */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* Enable/disable requests queued up for the next netstamp_clear() run */
static atomic_t netstamp_needed_deferred;
/* User count that netstamp_clear() applies to the static key */
static atomic_t netstamp_wanted;
/*
 * Work handler: atomically take the deferred delta (resetting it to 0),
 * add it to netstamp_wanted, and enable the static key when the
 * resulting count is positive or disable it otherwise.
 */
static void netstamp_clear(struct work_struct *work)
{
        int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
        int wanted;

        wanted = atomic_add_return(deferred, &netstamp_wanted);
        if (wanted > 0)
                static_key_enable(&netstamp_needed);
        else
                static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
1618
1619 void net_enable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622         int wanted;
1623
1624         while (1) {
1625                 wanted = atomic_read(&netstamp_wanted);
1626                 if (wanted <= 0)
1627                         break;
1628                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1629                         return;
1630         }
1631         atomic_inc(&netstamp_needed_deferred);
1632         schedule_work(&netstamp_work);
1633 #else
1634         static_key_slow_inc(&netstamp_needed);
1635 #endif
1636 }
1637 EXPORT_SYMBOL(net_enable_timestamp);
1638
1639 void net_disable_timestamp(void)
1640 {
1641 #ifdef HAVE_JUMP_LABEL
1642         int wanted;
1643
1644         while (1) {
1645                 wanted = atomic_read(&netstamp_wanted);
1646                 if (wanted <= 1)
1647                         break;
1648                 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1649                         return;
1650         }
1651         atomic_dec(&netstamp_needed_deferred);
1652         schedule_work(&netstamp_work);
1653 #else
1654         static_key_slow_dec(&netstamp_needed);
1655 #endif
1656 }
1657 EXPORT_SYMBOL(net_disable_timestamp);
1658
1659 static inline void net_timestamp_set(struct sk_buff *skb)
1660 {
1661         skb->tstamp.tv64 = 0;
1662         if (static_key_false(&netstamp_needed))
1663                 __net_timestamp(skb);
1664 }
1665
/*
 * Timestamp @SKB when the netstamp_needed static key is enabled, @COND
 * is true and the skb does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and can be used safely in an unbraced if/else.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1671
1672 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1673 {
1674         unsigned int len;
1675
1676         if (!(dev->flags & IFF_UP))
1677                 return false;
1678
1679         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1680         if (skb->len <= len)
1681                 return true;
1682
1683         /* if TSO is enabled, we don't care about the length as the packet
1684          * could be forwarded without being segmented before
1685          */
1686         if (skb_is_gso(skb))
1687                 return true;
1688
1689         return false;
1690 }
1691 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1692
1693 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1694 {
1695         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1696                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1697                         atomic_long_inc(&dev->rx_dropped);
1698                         kfree_skb(skb);
1699                         return NET_RX_DROP;
1700                 }
1701         }
1702
1703         if (unlikely(!is_skb_forwardable(dev, skb))) {
1704                 atomic_long_inc(&dev->rx_dropped);
1705                 kfree_skb(skb);
1706                 return NET_RX_DROP;
1707         }
1708
1709         skb_scrub_packet(skb, true);
1710         skb->protocol = eth_type_trans(skb, dev);
1711         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1712
1713         return 0;
1714 }
1715 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1716
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * Typical use is injecting an skb from the start_xmit function of one
 * device into the receive queue of another device.
 *
 * Because the receiving device may live in another namespace, all
 * information in the skb that could impact namespace isolation is
 * cleared first (done by __dev_forward_skb()).
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc = __dev_forward_skb(dev, skb);

	if (rc)
		return rc;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1740
1741 static inline int deliver_skb(struct sk_buff *skb,
1742                               struct packet_type *pt_prev,
1743                               struct net_device *orig_dev)
1744 {
1745         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1746                 return -ENOMEM;
1747         atomic_inc(&skb->users);
1748         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1749 }
1750
1751 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1752 {
1753         if (!ptype->af_packet_priv || !skb->sk)
1754                 return false;
1755
1756         if (ptype->id_match)
1757                 return ptype->id_match(ptype, skb->sk);
1758         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1759                 return true;
1760
1761         return false;
1762 }
1763
1764 /*
1765  *      Support routine. Sends outgoing frames to any network
1766  *      taps currently in use.
1767  */
1768
1769 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1770 {
1771         struct packet_type *ptype;
1772         struct sk_buff *skb2 = NULL;
1773         struct packet_type *pt_prev = NULL;
1774
1775         rcu_read_lock();
1776         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1777                 /* Never send packets back to the socket
1778                  * they originated from - MvS (miquels@drinkel.ow.org)
1779                  */
1780                 if ((ptype->dev == dev || !ptype->dev) &&
1781                     (!skb_loop_sk(ptype, skb))) {
1782                         if (pt_prev) {
1783                                 deliver_skb(skb2, pt_prev, skb->dev);
1784                                 pt_prev = ptype;
1785                                 continue;
1786                         }
1787
1788                         skb2 = skb_clone(skb, GFP_ATOMIC);
1789                         if (!skb2)
1790                                 break;
1791
1792                         net_timestamp_set(skb2);
1793
1794                         /* skb->nh should be correctly
1795                            set by sender, so that the second statement is
1796                            just protection against buggy protocols.
1797                          */
1798                         skb_reset_mac_header(skb2);
1799
1800                         if (skb_network_header(skb2) < skb2->data ||
1801                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1802                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1803                                                      ntohs(skb2->protocol),
1804                                                      dev->name);
1805                                 skb_reset_network_header(skb2);
1806                         }
1807
1808                         skb2->transport_header = skb2->network_header;
1809                         skb2->pkt_type = PACKET_OUTGOING;
1810                         pt_prev = ptype;
1811                 }
1812         }
1813         if (pt_prev)
1814                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1815         rcu_read_unlock();
1816 }
1817
1818 /**
1819  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1820  * @dev: Network device
1821  * @txq: number of queues available
1822  *
1823  * If real_num_tx_queues is changed the tc mappings may no longer be
1824  * valid. To resolve this verify the tc mapping remains valid and if
1825  * not NULL the mapping. With no priorities mapping to this
1826  * offset/count pair it will no longer be used. In the worst case TC0
1827  * is invalid nothing can be done so disable priority mappings. If is
1828  * expected that drivers will fix this mapping if they can before
1829  * calling netif_set_real_num_tx_queues.
1830  */
1831 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1832 {
1833         int i;
1834         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1835
1836         /* If TC0 is invalidated disable TC mapping */
1837         if (tc->offset + tc->count > txq) {
1838                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1839                 dev->num_tc = 0;
1840                 return;
1841         }
1842
1843         /* Invalidated prio to tc mappings set to TC0 */
1844         for (i = 1; i < TC_BITMASK + 1; i++) {
1845                 int q = netdev_get_prio_tc_map(dev, i);
1846
1847                 tc = &dev->tc_to_txq[q];
1848                 if (tc->offset + tc->count > txq) {
1849                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1850                                 i, q);
1851                         netdev_set_prio_tc_map(dev, i, 0);
1852                 }
1853         }
1854 }
1855
1856 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update routines in this file. */
static DEFINE_MUTEX(xps_map_mutex);

/* Read an RCU-protected XPS pointer; the caller must hold xps_map_mutex. */
#define xmap_dereference(ptr)						\
	rcu_dereference_protected((ptr), lockdep_is_held(&xps_map_mutex))
1860
1861 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1862                                         int cpu, u16 index)
1863 {
1864         struct xps_map *map = NULL;
1865         int pos;
1866
1867         if (dev_maps)
1868                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1869
1870         for (pos = 0; map && pos < map->len; pos++) {
1871                 if (map->queues[pos] == index) {
1872                         if (map->len > 1) {
1873                                 map->queues[pos] = map->queues[--map->len];
1874                         } else {
1875                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1876                                 kfree_rcu(map, rcu);
1877                                 map = NULL;
1878                         }
1879                         break;
1880                 }
1881         }
1882
1883         return map;
1884 }
1885
1886 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1887 {
1888         struct xps_dev_maps *dev_maps;
1889         int cpu, i;
1890         bool active = false;
1891
1892         mutex_lock(&xps_map_mutex);
1893         dev_maps = xmap_dereference(dev->xps_maps);
1894
1895         if (!dev_maps)
1896                 goto out_no_maps;
1897
1898         for_each_possible_cpu(cpu) {
1899                 for (i = index; i < dev->num_tx_queues; i++) {
1900                         if (!remove_xps_queue(dev_maps, cpu, i))
1901                                 break;
1902                 }
1903                 if (i == dev->num_tx_queues)
1904                         active = true;
1905         }
1906
1907         if (!active) {
1908                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1909                 kfree_rcu(dev_maps, rcu);
1910         }
1911
1912         for (i = index; i < dev->num_tx_queues; i++)
1913                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1914                                              NUMA_NO_NODE);
1915
1916 out_no_maps:
1917         mutex_unlock(&xps_map_mutex);
1918 }
1919
1920 static struct xps_map *expand_xps_map(struct xps_map *map,
1921                                       int cpu, u16 index)
1922 {
1923         struct xps_map *new_map;
1924         int alloc_len = XPS_MIN_MAP_ALLOC;
1925         int i, pos;
1926
1927         for (pos = 0; map && pos < map->len; pos++) {
1928                 if (map->queues[pos] != index)
1929                         continue;
1930                 return map;
1931         }
1932
1933         /* Need to add queue to this CPU's existing map */
1934         if (map) {
1935                 if (pos < map->alloc_len)
1936                         return map;
1937
1938                 alloc_len = map->alloc_len * 2;
1939         }
1940
1941         /* Need to allocate new map to store queue on this CPU's map */
1942         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1943                                cpu_to_node(cpu));
1944         if (!new_map)
1945                 return NULL;
1946
1947         for (i = 0; i < pos; i++)
1948                 new_map->queues[i] = map->queues[i];
1949         new_map->alloc_len = alloc_len;
1950         new_map->len = pos;
1951
1952         return new_map;
1953 }
1954
1955 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1956                         u16 index)
1957 {
1958         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1959         struct xps_map *map, *new_map;
1960         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1961         int cpu, numa_node_id = -2;
1962         bool active = false;
1963
1964         mutex_lock(&xps_map_mutex);
1965
1966         dev_maps = xmap_dereference(dev->xps_maps);
1967
1968         /* allocate memory for queue storage */
1969         for_each_online_cpu(cpu) {
1970                 if (!cpumask_test_cpu(cpu, mask))
1971                         continue;
1972
1973                 if (!new_dev_maps)
1974                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1975                 if (!new_dev_maps) {
1976                         mutex_unlock(&xps_map_mutex);
1977                         return -ENOMEM;
1978                 }
1979
1980                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1981                                  NULL;
1982
1983                 map = expand_xps_map(map, cpu, index);
1984                 if (!map)
1985                         goto error;
1986
1987                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1988         }
1989
1990         if (!new_dev_maps)
1991                 goto out_no_new_maps;
1992
1993         for_each_possible_cpu(cpu) {
1994                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1995                         /* add queue to CPU maps */
1996                         int pos = 0;
1997
1998                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1999                         while ((pos < map->len) && (map->queues[pos] != index))
2000                                 pos++;
2001
2002                         if (pos == map->len)
2003                                 map->queues[map->len++] = index;
2004 #ifdef CONFIG_NUMA
2005                         if (numa_node_id == -2)
2006                                 numa_node_id = cpu_to_node(cpu);
2007                         else if (numa_node_id != cpu_to_node(cpu))
2008                                 numa_node_id = -1;
2009 #endif
2010                 } else if (dev_maps) {
2011                         /* fill in the new device map from the old device map */
2012                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2013                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2014                 }
2015
2016         }
2017
2018         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2019
2020         /* Cleanup old maps */
2021         if (dev_maps) {
2022                 for_each_possible_cpu(cpu) {
2023                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2024                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2025                         if (map && map != new_map)
2026                                 kfree_rcu(map, rcu);
2027                 }
2028
2029                 kfree_rcu(dev_maps, rcu);
2030         }
2031
2032         dev_maps = new_dev_maps;
2033         active = true;
2034
2035 out_no_new_maps:
2036         /* update Tx queue numa node */
2037         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2038                                      (numa_node_id >= 0) ? numa_node_id :
2039                                      NUMA_NO_NODE);
2040
2041         if (!dev_maps)
2042                 goto out_no_maps;
2043
2044         /* removes queue from unused CPUs */
2045         for_each_possible_cpu(cpu) {
2046                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2047                         continue;
2048
2049                 if (remove_xps_queue(dev_maps, cpu, index))
2050                         active = true;
2051         }
2052
2053         /* free map if not active */
2054         if (!active) {
2055                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2056                 kfree_rcu(dev_maps, rcu);
2057         }
2058
2059 out_no_maps:
2060         mutex_unlock(&xps_map_mutex);
2061
2062         return 0;
2063 error:
2064         /* remove any maps that we added */
2065         for_each_possible_cpu(cpu) {
2066                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2067                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2068                                  NULL;
2069                 if (new_map && new_map != map)
2070                         kfree(new_map);
2071         }
2072
2073         mutex_unlock(&xps_map_mutex);
2074
2075         kfree(new_dev_maps);
2076         return -ENOMEM;
2077 }
2078 EXPORT_SYMBOL(netif_set_xps_queue);
2079
2080 #endif
2081 /*
2082  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2083  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2084  */
2085 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2086 {
2087         int rc;
2088
2089         if (txq < 1 || txq > dev->num_tx_queues)
2090                 return -EINVAL;
2091
2092         if (dev->reg_state == NETREG_REGISTERED ||
2093             dev->reg_state == NETREG_UNREGISTERING) {
2094                 ASSERT_RTNL();
2095
2096                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2097                                                   txq);
2098                 if (rc)
2099                         return rc;
2100
2101                 if (dev->num_tc)
2102                         netif_setup_tc(dev, txq);
2103
2104                 if (txq < dev->real_num_tx_queues) {
2105                         qdisc_reset_all_tx_gt(dev, txq);
2106 #ifdef CONFIG_XPS
2107                         netif_reset_xps_queues_gt(dev, txq);
2108 #endif
2109                 }
2110         }
2111
2112         dev->real_num_tx_queues = txq;
2113         return 0;
2114 }
2115 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2116
2117 #ifdef CONFIG_SYSFS
2118 /**
2119  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2120  *      @dev: Network device
2121  *      @rxq: Actual number of RX queues
2122  *
2123  *      This must be called either with the rtnl_lock held or before
2124  *      registration of the net device.  Returns 0 on success, or a
2125  *      negative error code.  If called before registration, it always
2126  *      succeeds.
2127  */
2128 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2129 {
2130         int rc;
2131
2132         if (rxq < 1 || rxq > dev->num_rx_queues)
2133                 return -EINVAL;
2134
2135         if (dev->reg_state == NETREG_REGISTERED) {
2136                 ASSERT_RTNL();
2137
2138                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2139                                                   rxq);
2140                 if (rc)
2141                         return rc;
2142         }
2143
2144         dev->real_num_rx_queues = rxq;
2145         return 0;
2146 }
2147 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2148 #endif
2149
2150 /**
2151  * netif_get_num_default_rss_queues - default number of RSS queues
2152  *
2153  * This routine should set an upper limit on the number of RSS queues
2154  * used by default by multiqueue devices.
2155  */
2156 int netif_get_num_default_rss_queues(void)
2157 {
2158         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2159 }
2160 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2161
2162 static inline void __netif_reschedule(struct Qdisc *q)
2163 {
2164         struct softnet_data *sd;
2165         unsigned long flags;
2166
2167         local_irq_save(flags);
2168         sd = &__get_cpu_var(softnet_data);
2169         q->next_sched = NULL;
2170         *sd->output_queue_tailp = q;
2171         sd->output_queue_tailp = &q->next_sched;
2172         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2173         local_irq_restore(flags);
2174 }
2175
2176 void __netif_schedule(struct Qdisc *q)
2177 {
2178         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2179                 __netif_reschedule(q);
2180 }
2181 EXPORT_SYMBOL(__netif_schedule);
2182
2183 struct dev_kfree_skb_cb {
2184         enum skb_free_reason reason;
2185 };
2186
2187 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2188 {
2189         return (struct dev_kfree_skb_cb *)skb->cb;
2190 }
2191
2192 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2193 {
2194         unsigned long flags;
2195
2196         if (likely(atomic_read(&skb->users) == 1)) {
2197                 smp_rmb();
2198                 atomic_set(&skb->users, 0);
2199         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2200                 return;
2201         }
2202         get_kfree_skb_cb(skb)->reason = reason;
2203         local_irq_save(flags);
2204         skb->next = __this_cpu_read(softnet_data.completion_queue);
2205         __this_cpu_write(softnet_data.completion_queue, skb);
2206         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2207         local_irq_restore(flags);
2208 }
2209 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2210
2211 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2212 {
2213         if (in_irq() || irqs_disabled())
2214                 __dev_kfree_skb_irq(skb, reason);
2215         else
2216                 dev_kfree_skb(skb);
2217 }
2218 EXPORT_SYMBOL(__dev_kfree_skb_any);
2219
2220
2221 /**
2222  * netif_device_detach - mark device as removed
2223  * @dev: network device
2224  *
2225  * Mark device as removed from system and therefore no longer available.
2226  */
2227 void netif_device_detach(struct net_device *dev)
2228 {
2229         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2230             netif_running(dev)) {
2231                 netif_tx_stop_all_queues(dev);
2232         }
2233 }
2234 EXPORT_SYMBOL(netif_device_detach);
2235
2236 /**
2237  * netif_device_attach - mark device as attached
2238  * @dev: network device
2239  *
2240  * Mark device as attached from system and restart if needed.
2241  */
2242 void netif_device_attach(struct net_device *dev)
2243 {
2244         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2245             netif_running(dev)) {
2246                 netif_tx_wake_all_queues(dev);
2247                 __netdev_watchdog_up(dev);
2248         }
2249 }
2250 EXPORT_SYMBOL(netif_device_attach);
2251
2252 static void skb_warn_bad_offload(const struct sk_buff *skb)
2253 {
2254         static const netdev_features_t null_features = 0;
2255         struct net_device *dev = skb->dev;
2256         const char *driver = "";
2257
2258         if (!net_ratelimit())
2259                 return;
2260
2261         if (dev && dev->dev.parent)
2262                 driver = dev_driver_string(dev->dev.parent);
2263
2264         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2265              "gso_type=%d ip_summed=%d\n",
2266              driver, dev ? &dev->features : &null_features,
2267              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2268              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2269              skb_shinfo(skb)->gso_type, skb->ip_summed);
2270 }
2271
2272 /*
2273  * Invalidate hardware checksum when packet is to be mangled, and
2274  * complete checksum manually on outgoing path.
2275  */
2276 int skb_checksum_help(struct sk_buff *skb)
2277 {
2278         __wsum csum;
2279         int ret = 0, offset;
2280
2281         if (skb->ip_summed == CHECKSUM_COMPLETE)
2282                 goto out_set_summed;
2283
2284         if (unlikely(skb_shinfo(skb)->gso_size)) {
2285                 skb_warn_bad_offload(skb);
2286                 return -EINVAL;
2287         }
2288
2289         /* Before computing a checksum, we should make sure no frag could
2290          * be modified by an external entity : checksum could be wrong.
2291          */
2292         if (skb_has_shared_frag(skb)) {
2293                 ret = __skb_linearize(skb);
2294                 if (ret)
2295                         goto out;
2296         }
2297
2298         offset = skb_checksum_start_offset(skb);
2299         BUG_ON(offset >= skb_headlen(skb));
2300         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2301
2302         offset += skb->csum_offset;
2303         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2304
2305         if (skb_cloned(skb) &&
2306             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2307                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2308                 if (ret)
2309                         goto out;
2310         }
2311
2312         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2313 out_set_summed:
2314         skb->ip_summed = CHECKSUM_NONE;
2315 out:
2316         return ret;
2317 }
2318 EXPORT_SYMBOL(skb_checksum_help);
2319
2320 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2321 {
2322         unsigned int vlan_depth = skb->mac_len;
2323         __be16 type = skb->protocol;
2324
2325         /* Tunnel gso handlers can set protocol to ethernet. */
2326         if (type == htons(ETH_P_TEB)) {
2327                 struct ethhdr *eth;
2328
2329                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2330                         return 0;
2331
2332                 eth = (struct ethhdr *)skb_mac_header(skb);
2333                 type = eth->h_proto;
2334         }
2335
2336         /* if skb->protocol is 802.1Q/AD then the header should already be
2337          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2338          * ETH_HLEN otherwise
2339          */
2340         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2341                 if (vlan_depth) {
2342                         if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
2343                                 return 0;
2344                         vlan_depth -= VLAN_HLEN;
2345                 } else {
2346                         vlan_depth = ETH_HLEN;
2347                 }
2348                 do {
2349                         struct vlan_hdr *vh;
2350
2351                         if (unlikely(!pskb_may_pull(skb,
2352                                                     vlan_depth + VLAN_HLEN)))
2353                                 return 0;
2354
2355                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2356                         type = vh->h_vlan_encapsulated_proto;
2357                         vlan_depth += VLAN_HLEN;
2358                 } while (type == htons(ETH_P_8021Q) ||
2359                          type == htons(ETH_P_8021AD));
2360         }
2361
2362         *depth = vlan_depth;
2363
2364         return type;
2365 }
2366
2367 /**
2368  *      skb_mac_gso_segment - mac layer segmentation handler.
2369  *      @skb: buffer to segment
2370  *      @features: features for the output path (see dev->features)
2371  */
2372 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2373                                     netdev_features_t features)
2374 {
2375         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2376         struct packet_offload *ptype;
2377         int vlan_depth = skb->mac_len;
2378         __be16 type = skb_network_protocol(skb, &vlan_depth);
2379
2380         if (unlikely(!type))
2381                 return ERR_PTR(-EINVAL);
2382
2383         __skb_pull(skb, vlan_depth);
2384
2385         rcu_read_lock();
2386         list_for_each_entry_rcu(ptype, &offload_base, list) {
2387                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2388                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2389                                 int err;
2390
2391                                 err = ptype->callbacks.gso_send_check(skb);
2392                                 segs = ERR_PTR(err);
2393                                 if (err || skb_gso_ok(skb, features))
2394                                         break;
2395                                 __skb_push(skb, (skb->data -
2396                                                  skb_network_header(skb)));
2397                         }
2398                         segs = ptype->callbacks.gso_segment(skb, features);
2399                         break;
2400                 }
2401         }
2402         rcu_read_unlock();
2403
2404         __skb_push(skb, skb->data - skb_mac_header(skb));
2405
2406         return segs;
2407 }
2408 EXPORT_SYMBOL(skb_mac_gso_segment);
2409
2410
2411 /* openvswitch calls this on rx path, so we need a different check.
2412  */
2413 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2414 {
2415         if (tx_path)
2416                 return skb->ip_summed != CHECKSUM_PARTIAL &&
2417                        skb->ip_summed != CHECKSUM_UNNECESSARY;
2418
2419         return skb->ip_summed == CHECKSUM_NONE;
2420 }
2421
2422 /**
2423  *      __skb_gso_segment - Perform segmentation on skb.
2424  *      @skb: buffer to segment
2425  *      @features: features for the output path (see dev->features)
2426  *      @tx_path: whether it is called in TX path
2427  *
2428  *      This function segments the given skb and returns a list of segments.
2429  *
2430  *      It may return NULL if the skb requires no segmentation.  This is
2431  *      only possible when GSO is used for verifying header integrity.
2432  */
2433 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2434                                   netdev_features_t features, bool tx_path)
2435 {
2436         struct sk_buff *segs;
2437
2438         if (unlikely(skb_needs_check(skb, tx_path))) {
2439                 int err;
2440
2441                 /* We're going to init ->check field in TCP or UDP header */
2442                 err = skb_cow_head(skb, 0);
2443                 if (err < 0)
2444                         return ERR_PTR(err);
2445         }
2446
2447         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2448         SKB_GSO_CB(skb)->encap_level = 0;
2449
2450         skb_reset_mac_header(skb);
2451         skb_reset_mac_len(skb);
2452
2453         segs = skb_mac_gso_segment(skb, features);
2454
2455         if (unlikely(skb_needs_check(skb, tx_path)))
2456                 skb_warn_bad_offload(skb);
2457
2458         return segs;
2459 }
2460 EXPORT_SYMBOL(__skb_gso_segment);
2461
2462 /* Take action when hardware reception checksum errors are detected. */
2463 #ifdef CONFIG_BUG
2464 void netdev_rx_csum_fault(struct net_device *dev)
2465 {
2466         if (net_ratelimit()) {
2467                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2468                 dump_stack();
2469         }
2470 }
2471 EXPORT_SYMBOL(netdev_rx_csum_fault);
2472 #endif
2473
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a page fragment of @skb cannot be handled by @dev:
 * either it sits in high memory while the device lacks NETIF_F_HIGHDMA,
 * or (when PCI_DMA_BUS_IS_PHYS) the parent device has no DMA mask or the
 * fragment's page lies beyond it.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2506
/*
 * Kept in skb->cb of a software-segmented skb: the destructor the skb had
 * before dev_gso_segment() installed dev_gso_skb_destructor().
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of an skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(_skb) ((struct dev_gso_cb *)(_skb)->cb)
2512
2513 static void dev_gso_skb_destructor(struct sk_buff *skb)
2514 {
2515         struct dev_gso_cb *cb;
2516
2517         kfree_skb_list(skb->next);
2518         skb->next = NULL;
2519
2520         cb = DEV_GSO_CB(skb);
2521         if (cb->destructor)
2522                 cb->destructor(skb);
2523 }
2524
2525 /**
2526  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2527  *      @skb: buffer to segment
2528  *      @features: device features as applicable to this skb
2529  *
2530  *      This function segments the given skb and stores the list of segments
2531  *      in skb->next.
2532  */
2533 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2534 {
2535         struct sk_buff *segs;
2536
2537         segs = skb_gso_segment(skb, features);
2538
2539         /* Verifying header integrity only. */
2540         if (!segs)
2541                 return 0;
2542
2543         if (IS_ERR(segs))
2544                 return PTR_ERR(segs);
2545
2546         skb->next = segs;
2547         DEV_GSO_CB(skb)->destructor = skb->destructor;
2548         skb->destructor = dev_gso_skb_destructor;
2549
2550         return 0;
2551 }
2552
2553 /* If MPLS offload request, verify we are testing hardware MPLS features
2554  * instead of standard features for the netdev.
2555  */
2556 #ifdef CONFIG_NET_MPLS_GSO
2557 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2558                                            netdev_features_t features,
2559                                            __be16 type)
2560 {
2561         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2562                 features &= skb->dev->mpls_features;
2563
2564         return features;
2565 }
2566 #else
2567 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2568                                            netdev_features_t features,
2569                                            __be16 type)
2570 {
2571         return features;
2572 }
2573 #endif
2574
2575 static netdev_features_t harmonize_features(struct sk_buff *skb,
2576         netdev_features_t features)
2577 {
2578         int tmp;
2579         __be16 type;
2580
2581         type = skb_network_protocol(skb, &tmp);
2582         features = net_mpls_features(skb, features, type);
2583
2584         if (skb->ip_summed != CHECKSUM_NONE &&
2585             !can_checksum_protocol(features, type)) {
2586                 features &= ~NETIF_F_ALL_CSUM;
2587         }
2588         if (illegal_highdma(skb->dev, skb))
2589                 features &= ~NETIF_F_SG;
2590
2591         return features;
2592 }
2593
2594 netdev_features_t netif_skb_features(struct sk_buff *skb)
2595 {
2596         __be16 protocol = skb->protocol;
2597         netdev_features_t features = skb->dev->features;
2598
2599         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2600                 features &= ~NETIF_F_GSO_MASK;
2601
2602         if (!vlan_tx_tag_present(skb)) {
2603                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2604                              protocol == htons(ETH_P_8021AD))) {
2605                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2606                         protocol = veh->h_vlan_encapsulated_proto;
2607                 } else {
2608                         return harmonize_features(skb, features);
2609                 }
2610         }
2611
2612         features = netdev_intersect_features(features,
2613                                              skb->dev->vlan_features |
2614                                              NETIF_F_HW_VLAN_CTAG_TX |
2615                                              NETIF_F_HW_VLAN_STAG_TX);
2616
2617         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2618                 features = netdev_intersect_features(features,
2619                                                      NETIF_F_SG |
2620                                                      NETIF_F_HIGHDMA |
2621                                                      NETIF_F_FRAGLIST |
2622                                                      NETIF_F_GEN_CSUM |
2623                                                      NETIF_F_HW_VLAN_CTAG_TX |
2624                                                      NETIF_F_HW_VLAN_STAG_TX);
2625
2626         return harmonize_features(skb, features);
2627 }
2628 EXPORT_SYMBOL(netif_skb_features);
2629
/*
 * dev_hard_start_xmit - hand @skb, or its pending GSO segment list, to the
 * driver's ndo_start_xmit() for queue @txq.
 *
 * An skb without skb->next is prepared first: its dst is dropped when the
 * device has IFF_XMIT_DST_RELEASE, a VLAN tag is inserted in software when
 * the device cannot offload it, and the skb is either GSO-segmented or
 * linearized/checksummed in software according to the computed features.
 * An skb that already has skb->next set (or that was just segmented) is
 * sent one segment at a time in the "gso" loop.
 *
 * Returns the value of the last ndo_start_xmit() call, NETDEV_TX_BUSY when
 * the queue stopped with segments still pending, or the initial
 * NETDEV_TX_OK when the skb was freed here before reaching the driver.
 */
2630 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2631                         struct netdev_queue *txq)
2632 {
2633         const struct net_device_ops *ops = dev->netdev_ops;
2634         int rc = NETDEV_TX_OK;
2635         unsigned int skb_len;
2636
2637         if (likely(!skb->next)) {
2638                 netdev_features_t features;
2639
2640                 /*
2641                  * If device doesn't need skb->dst, release it right now while
2642                  * its hot in this cpu cache
2643                  */
2644                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2645                         skb_dst_drop(skb);
2646
2647                 features = netif_skb_features(skb);
2648
                     /* Device cannot insert this tag itself: put it in the payload. */
2649                 if (vlan_tx_tag_present(skb) &&
2650                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2651                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2652                                              vlan_tx_tag_get(skb));
2653                         if (unlikely(!skb))
2654                                 goto out;
2655
2656                         skb->vlan_tci = 0;
2657                 }
2658
2659                 /* If encapsulation offload request, verify we are testing
2660                  * hardware encapsulation features instead of standard
2661                  * features for the netdev
2662                  */
2663                 if (skb->encapsulation)
2664                         features &= dev->hw_enc_features;
2665
2666                 if (netif_needs_gso(skb, features)) {
2667                         if (unlikely(dev_gso_segment(skb, features)))
2668                                 goto out_kfree_skb;
                             /* A segment list was produced: send it piecewise. */
2669                         if (skb->next)
2670                                 goto gso;
2671                 } else {
2672                         if (skb_needs_linearize(skb, features) &&
2673                             __skb_linearize(skb))
2674                                 goto out_kfree_skb;
2675
2676                         /* If packet is not checksummed and device does not
2677                          * support checksumming for this protocol, complete
2678                          * checksumming here.
2679                          */
2680                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2681                                 if (skb->encapsulation)
2682                                         skb_set_inner_transport_header(skb,
2683                                                 skb_checksum_start_offset(skb));
2684                                 else
2685                                         skb_set_transport_header(skb,
2686                                                 skb_checksum_start_offset(skb));
2687                                 if (!(features & NETIF_F_ALL_CSUM) &&
2688                                      skb_checksum_help(skb))
2689                                         goto out_kfree_skb;
2690                         }
2691                 }
2692
                     /* Give registered ptype_all taps a look at the packet. */
2693                 if (!list_empty(&ptype_all))
2694                         dev_queue_xmit_nit(skb, dev);
2695
                     /* Length is saved before the driver call and used for tracing. */
2696                 skb_len = skb->len;
2697                 trace_net_dev_start_xmit(skb, dev);
2698                 rc = ops->ndo_start_xmit(skb, dev);
2699                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2700                 if (rc == NETDEV_TX_OK)
2701                         txq_trans_update(txq);
2702                 return rc;
2703         }
2704
2705 gso:
2706         do {
                     /* Unlink the next segment from the list headed by skb. */
2707                 struct sk_buff *nskb = skb->next;
2708
2709                 skb->next = nskb->next;
2710                 nskb->next = NULL;
2711
2712                 if (!list_empty(&ptype_all))
2713                         dev_queue_xmit_nit(nskb, dev);
2714
2715                 skb_len = nskb->len;
2716                 trace_net_dev_start_xmit(nskb, dev);
2717                 rc = ops->ndo_start_xmit(nskb, dev);
2718                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2719                 if (unlikely(rc != NETDEV_TX_OK)) {
                             /* Codes outside NETDEV_TX_MASK: do not re-link the segment. */
2720                         if (rc & ~NETDEV_TX_MASK)
2721                                 goto out_kfree_gso_skb;
                             /* Otherwise put the segment back at the head and report rc. */
2722                         nskb->next = skb->next;
2723                         skb->next = nskb;
2724                         return rc;
2725                 }
2726                 txq_trans_update(txq);
2727                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2728                         return NETDEV_TX_BUSY;
2729         } while (skb->next);
2730
2731 out_kfree_gso_skb:
             /* All segments gone: restore the saved destructor and release the
              * original skb as consumed.  Otherwise fall through and free it
              * (the GSO destructor installed by dev_gso_segment() is still set).
              */
2732         if (likely(skb->next == NULL)) {
2733                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2734                 consume_skb(skb);
2735                 return rc;
2736         }
2737 out_kfree_skb:
2738         kfree_skb(skb);
2739 out:
2740         return rc;
2741 }
2742 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2743
2744 static void qdisc_pkt_len_init(struct sk_buff *skb)
2745 {
2746         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2747
2748         qdisc_skb_cb(skb)->pkt_len = skb->len;
2749
2750         /* To get more precise estimation of bytes sent on wire,
2751          * we add to pkt_len the headers size of all segments
2752          */
2753         if (shinfo->gso_size)  {
2754                 unsigned int hdr_len;
2755                 u16 gso_segs = shinfo->gso_segs;
2756
2757                 /* mac layer + network layer */
2758                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2759
2760                 /* + transport layer */
2761                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2762                         hdr_len += tcp_hdrlen(skb);
2763                 else
2764                         hdr_len += sizeof(struct udphdr);
2765
2766                 if (shinfo->gso_type & SKB_GSO_DODGY)
2767                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2768                                                 shinfo->gso_size);
2769
2770                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2771         }
2772 }
2773
/*
 * __dev_xmit_skb - submit @skb to qdisc @q of @dev / @txq.
 *
 * Under the qdisc root lock one of three things happens:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc allows bypass, is empty and not running: the skb is sent
 *    directly through sch_direct_xmit(), NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued and the qdisc is run if it was idle; the
 *    enqueue result masked with NET_XMIT_MASK is returned.
 *
 * q->busylock is taken first only when the qdisc was already running at
 * entry, and is released either before __qdisc_run() or at the end.
 */
2774 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2775                                  struct net_device *dev,
2776                                  struct netdev_queue *txq)
2777 {
2778         spinlock_t *root_lock = qdisc_lock(q);
2779         bool contended;
2780         int rc;
2781
2782         qdisc_pkt_len_init(skb);
2783         qdisc_calculate_pkt_len(skb, q);
2784         /*
2785          * Heuristic to force contended enqueues to serialize on a
2786          * separate lock before trying to get qdisc main lock.
2787          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2788          * and dequeue packets faster.
2789          */
2790         contended = qdisc_is_running(q);
2791         if (unlikely(contended))
2792                 spin_lock(&q->busylock);
2793
2794         spin_lock(root_lock);
2795         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2796                 kfree_skb(skb);
2797                 rc = NET_XMIT_DROP;
2798         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2799                    qdisc_run_begin(q)) {
2800                 /*
2801                  * This is a work-conserving queue; there are no old skbs
2802                  * waiting to be sent out; and the qdisc is not running -
2803                  * xmit the skb directly.
2804                  */
2805                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2806                         skb_dst_force(skb);
2807
2808                 qdisc_bstats_update(q, skb);
2809
                     /* Non-zero return: keep running the qdisc; drop busylock first. */
2810                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2811                         if (unlikely(contended)) {
2812                                 spin_unlock(&q->busylock);
2813                                 contended = false;
2814                         }
2815                         __qdisc_run(q);
2816                 } else
2817                         qdisc_run_end(q);
2818
2819                 rc = NET_XMIT_SUCCESS;
2820         } else {
2821                 skb_dst_force(skb);
2822                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                     /* Only the caller that wins qdisc_run_begin() runs the queue. */
2823                 if (qdisc_run_begin(q)) {
2824                         if (unlikely(contended)) {
2825                                 spin_unlock(&q->busylock);
2826                                 contended = false;
2827                         }
2828                         __qdisc_run(q);
2829                 }
2830         }
2831         spin_unlock(root_lock);
             /* busylock is still held here unless it was dropped above. */
2832         if (unlikely(contended))
2833                 spin_unlock(&q->busylock);
2834         return rc;
2835 }
2836
2837 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2838 static void skb_update_prio(struct sk_buff *skb)
2839 {
2840         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2841
2842         if (!skb->priority && skb->sk && map) {
2843                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2844
2845                 if (prioidx < map->priomap_len)
2846                         skb->priority = map->priomap[prioidx];
2847         }
2848 }
2849 #else
2850 #define skb_update_prio(skb)
2851 #endif
2852
/* Per-CPU counter bumped around dev_hard_start_xmit() on the queueless
 * transmit path in __dev_queue_xmit().
 */
2853 DEFINE_PER_CPU(int, xmit_recursion);
2854 EXPORT_SYMBOL(xmit_recursion);
2855
/* __dev_queue_xmit() refuses to transmit once xmit_recursion exceeds this. */
2856 #define RECURSION_LIMIT 10
2857
2858 /**
2859  *      dev_loopback_xmit - loop back @skb
2860  *      @skb: buffer to transmit
2861  */
2862 int dev_loopback_xmit(struct sk_buff *skb)
2863 {
2864         skb_reset_mac_header(skb);
2865         __skb_pull(skb, skb_network_offset(skb));
2866         skb->pkt_type = PACKET_LOOPBACK;
2867         skb->ip_summed = CHECKSUM_UNNECESSARY;
2868         WARN_ON(!skb_dst(skb));
2869         skb_dst_force(skb);
2870         netif_rx_ni(skb);
2871         return 0;
2872 }
2873 EXPORT_SYMBOL(dev_loopback_xmit);
2874
2875 /**
2876  *      __dev_queue_xmit - transmit a buffer
2877  *      @skb: buffer to transmit
2878  *      @accel_priv: private data used for L2 forwarding offload
2879  *
2880  *      Queue a buffer for transmission to a network device. The caller must
2881  *      have set the device and priority and built the buffer before calling
2882  *      this function. The function can be called from an interrupt.
2883  *
2884  *      A negative errno code is returned on a failure. A success does not
2885  *      guarantee the frame will be transmitted as it may be dropped due
2886  *      to congestion or traffic shaping.
2887  *
2888  * -----------------------------------------------------------------------------------
2889  *      I notice this method can also return errors from the queue disciplines,
2890  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2891  *      be positive.
2892  *
2893  *      Regardless of the return value, the skb is consumed, so it is currently
2894  *      difficult to retry a send to this method.  (You can bump the ref count
2895  *      before sending to hold a reference for retry if you are careful.)
2896  *
2897  *      When calling this method, interrupts MUST be enabled.  This is because
2898  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2899  *          --BLG
2900  */
2901 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2902 {
2903         struct net_device *dev = skb->dev;
2904         struct netdev_queue *txq;
2905         struct Qdisc *q;
2906         int rc = -ENOMEM;
2907
2908         skb_reset_mac_header(skb);
2909
2910         /* Disable soft irqs for various locks below. Also
2911          * stops preemption for RCU.
2912          */
2913         rcu_read_lock_bh();
2914
2915         skb_update_prio(skb);
2916
2917         txq = netdev_pick_tx(dev, skb, accel_priv);
2918         q = rcu_dereference_bh(txq->qdisc);
2919
2920 #ifdef CONFIG_NET_CLS_ACT
2921         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2922 #endif
2923         trace_net_dev_queue(skb);
             /* A qdisc with an enqueue hook takes the packet from here. */
2924         if (q->enqueue) {
2925                 rc = __dev_xmit_skb(skb, q, dev, txq);
2926                 goto out;
2927         }
2928
2929         /* The device has no queue. Common case for software devices:
2930            loopback, all the sorts of tunnels...
2931
2932            Really, it is unlikely that netif_tx_lock protection is necessary
2933            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2934            counters.)
2935            However, it is possible, that they rely on protection
2936            made by us here.
2937
2938            Check this and shot the lock. It is not prone from deadlocks.
2939            Either shot noqueue qdisc, it is even simpler 8)
2940          */
2941         if (dev->flags & IFF_UP) {
2942                 int cpu = smp_processor_id(); /* ok because BHs are off */
2943
                     /* Same owner means this CPU is already inside this txq's xmit. */
2944                 if (txq->xmit_lock_owner != cpu) {
2945
2946                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2947                                 goto recursion_alert;
2948
2949                         HARD_TX_LOCK(dev, txq, cpu);
2950
2951                         if (!netif_xmit_stopped(txq)) {
2952                                 __this_cpu_inc(xmit_recursion);
2953                                 rc = dev_hard_start_xmit(skb, dev, txq);
2954                                 __this_cpu_dec(xmit_recursion);
2955                                 if (dev_xmit_complete(rc)) {
2956                                         HARD_TX_UNLOCK(dev, txq);
2957                                         goto out;
2958                                 }
2959                         }
2960                         HARD_TX_UNLOCK(dev, txq);
2961                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2962                                              dev->name);
2963                 } else {
2964                         /* Recursion is detected! It is possible,
2965                          * unfortunately
2966                          */
2967 recursion_alert:
2968                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2969                                              dev->name);
2970                 }
2971         }
2972
             /* Drop path: device down, queue stopped, xmit incomplete or recursion. */
2973         rc = -ENETDOWN;
2974         rcu_read_unlock_bh();
2975
2976         atomic_long_inc(&dev->tx_dropped);
2977         kfree_skb(skb);
2978         return rc;
2979 out:
2980         rcu_read_unlock_bh();
2981         return rc;
2982 }
2983
2984 int dev_queue_xmit(struct sk_buff *skb)
2985 {
2986         return __dev_queue_xmit(skb, NULL);
2987 }
2988 EXPORT_SYMBOL(dev_queue_xmit);
2989
/**
 *	dev_queue_xmit_accel - transmit a buffer with acceleration data
 *	@skb: buffer to transmit
 *	@accel_priv: private data handed through to __dev_queue_xmit()
 *
 *	Forwards both arguments to __dev_queue_xmit() and returns its result
 *	unchanged.
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2995
2996
2997 /*=======================================================================
2998                         Receiver routines
2999   =======================================================================*/
3000
/* Bound on a CPU's input_pkt_queue length: enqueue_to_backlog() drops once
 * the queue is longer than this, and skb_flow_limit() starts working at half
 * of it.
 */
3001 int netdev_max_backlog __read_mostly = 1000;
3002 EXPORT_SYMBOL(netdev_max_backlog);
3003
/* Passed as the condition to net_timestamp_check() in netif_rx_internal(). */
3004 int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * packet budget of one NET_RX softirq run - confirm against net_rx_action().
 */
3005 int netdev_budget __read_mostly = 300;
3006 int weight_p __read_mostly = 64;            /* old backlog weight */
3007
3008 /* Called with irq disabled */
3009 static inline void ____napi_schedule(struct softnet_data *sd,
3010                                      struct napi_struct *napi)
3011 {
3012         list_add_tail(&napi->poll_list, &sd->poll_list);
3013         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3014 }
3015
3016 #ifdef CONFIG_RPS
3017
3018 /* One global table that all flow-based protocols share. */
3019 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3020 EXPORT_SYMBOL(rps_sock_flow_table);
3021
/* Static key tested by netif_rx_internal() before it takes the RPS path. */
3022 struct static_key rps_needed __read_mostly;
3023
/*
 * set_rps_cpu - move the flow described by @rflow to @next_cpu.
 *
 * When @next_cpu is a real CPU, the flow's last_qtail is reset to that CPU's
 * current input_queue_head.  With CONFIG_RFS_ACCEL the driver is first asked
 * through ndo_rx_flow_steer() to steer the flow to the rx queue mapped to
 * @next_cpu; on success the flow entry of that queue's flow table replaces
 * @rflow and carries the returned filter id.
 *
 * Returns the flow entry now in use (either @rflow or the new queue's entry),
 * with its cpu field set to @next_cpu.
 */
3024 static struct rps_dev_flow *
3025 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3026             struct rps_dev_flow *rflow, u16 next_cpu)
3027 {
3028         if (next_cpu != RPS_NO_CPU) {
3029 #ifdef CONFIG_RFS_ACCEL
3030                 struct netdev_rx_queue *rxqueue;
3031                 struct rps_dev_flow_table *flow_table;
3032                 struct rps_dev_flow *old_rflow;
3033                 u32 flow_id;
3034                 u16 rxq_index;
3035                 int rc;
3036
3037                 /* Should we steer this flow to a different hardware queue? */
3038                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3039                     !(dev->features & NETIF_F_NTUPLE))
3040                         goto out;
3041                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* Already arriving on the queue mapped to next_cpu. */
3042                 if (rxq_index == skb_get_rx_queue(skb))
3043                         goto out;
3044
3045                 rxqueue = dev->_rx + rxq_index;
3046                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3047                 if (!flow_table)
3048                         goto out;
3049                 flow_id = skb_get_hash(skb) & flow_table->mask;
3050                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3051                                                         rxq_index, flow_id);
3052                 if (rc < 0)
3053                         goto out;
                     /* Switch to the new queue's entry and record the filter id;
                      * the old entry gives up the filter if it held the same id.
                      */
3054                 old_rflow = rflow;
3055                 rflow = &flow_table->flows[flow_id];
3056                 rflow->filter = rc;
3057                 if (old_rflow->filter == rflow->filter)
3058                         old_rflow->filter = RPS_NO_FILTER;
3059         out:
3060 #endif
3061                 rflow->last_qtail =
3062                         per_cpu(softnet_data, next_cpu).input_queue_head;
3063         }
3064
3065         rflow->cpu = next_cpu;
3066         return rflow;
3067 }
3068
3069 /*
3070  * get_rps_cpu is called from netif_receive_skb and returns the target
3071  * CPU from the RPS map of the receiving queue for a given skb.
3072  * rcu_read_lock must be held on entry.
 *
 * Returns the chosen online CPU, or -1 when no CPU was selected (no RPS
 * configuration on the queue, zero hash, or the candidate CPU is offline).
 * *rflowp is written only when the CPU comes from the flow tables.
3073  */
3074 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3075                        struct rps_dev_flow **rflowp)
3076 {
3077         struct netdev_rx_queue *rxqueue;
3078         struct rps_map *map;
3079         struct rps_dev_flow_table *flow_table;
3080         struct rps_sock_flow_table *sock_flow_table;
3081         int cpu = -1;
3082         u16 tcpu;
3083         u32 hash;
3084
             /* Pick the rx queue the skb was recorded on, else queue 0. */
3085         if (skb_rx_queue_recorded(skb)) {
3086                 u16 index = skb_get_rx_queue(skb);
3087                 if (unlikely(index >= dev->real_num_rx_queues)) {
3088                         WARN_ONCE(dev->real_num_rx_queues > 1,
3089                                   "%s received packet on queue %u, but number "
3090                                   "of RX queues is %u\n",
3091                                   dev->name, index, dev->real_num_rx_queues);
3092                         goto done;
3093                 }
3094                 rxqueue = dev->_rx + index;
3095         } else
3096                 rxqueue = dev->_rx;
3097
3098         map = rcu_dereference(rxqueue->rps_map);
3099         if (map) {
                     /* Single-CPU map and no flow table: no hash needed. */
3100                 if (map->len == 1 &&
3101                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3102                         tcpu = map->cpus[0];
3103                         if (cpu_online(tcpu))
3104                                 cpu = tcpu;
3105                         goto done;
3106                 }
3107         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3108                 goto done;
3109         }
3110
3111         skb_reset_network_header(skb);
3112         hash = skb_get_hash(skb);
3113         if (!hash)
3114                 goto done;
3115
3116         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3117         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3118         if (flow_table && sock_flow_table) {
3119                 u16 next_cpu;
3120                 struct rps_dev_flow *rflow;
3121
3122                 rflow = &flow_table->flows[hash & flow_table->mask];
3123                 tcpu = rflow->cpu;
3124
3125                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3126
3127                 /*
3128                  * If the desired CPU (where last recvmsg was done) is
3129                  * different from current CPU (one in the rx-queue flow
3130                  * table entry), switch if one of the following holds:
3131                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3132                  *   - Current CPU is offline.
3133                  *   - The current CPU's queue tail has advanced beyond the
3134                  *     last packet that was enqueued using this table entry.
3135                  *     This guarantees that all previous packets for the flow
3136                  *     have been dequeued, thus preserving in order delivery.
3137                  */
3138                 if (unlikely(tcpu != next_cpu) &&
3139                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3140                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3141                       rflow->last_qtail)) >= 0)) {
3142                         tcpu = next_cpu;
3143                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3144                 }
3145
3146                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3147                         *rflowp = rflow;
3148                         cpu = tcpu;
3149                         goto done;
3150                 }
3151         }
3152
             /* Fall back to the RPS map: scale the 32-bit hash onto map->len. */
3153         if (map) {
3154                 tcpu = map->cpus[((u64) hash * map->len) >> 32];
3155
3156                 if (cpu_online(tcpu)) {
3157                         cpu = tcpu;
3158                         goto done;
3159                 }
3160         }
3161
3162 done:
3163         return cpu;
3164 }
3165
3166 #ifdef CONFIG_RFS_ACCEL
3167
3168 /**
3169  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3170  * @dev: Device on which the filter was set
3171  * @rxq_index: RX queue index
3172  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3173  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3174  *
3175  * Drivers that implement ndo_rx_flow_steer() should periodically call
3176  * this function for each installed filter and remove the filters for
3177  * which it returns %true.
3178  */
3179 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3180                          u32 flow_id, u16 filter_id)
3181 {
3182         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3183         struct rps_dev_flow_table *flow_table;
3184         struct rps_dev_flow *rflow;
3185         bool expire = true;
3186         int cpu;
3187
3188         rcu_read_lock();
3189         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3190         if (flow_table && flow_id <= flow_table->mask) {
3191                 rflow = &flow_table->flows[flow_id];
3192                 cpu = ACCESS_ONCE(rflow->cpu);
3193                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3194                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3195                            rflow->last_qtail) <
3196                      (int)(10 * flow_table->mask)))
3197                         expire = false;
3198         }
3199         rcu_read_unlock();
3200         return expire;
3201 }
3202 EXPORT_SYMBOL(rps_may_expire_flow);
3203
3204 #endif /* CONFIG_RFS_ACCEL */
3205
3206 /* Called from hardirq (IPI) context */
3207 static void rps_trigger_softirq(void *data)
3208 {
3209         struct softnet_data *sd = data;
3210
3211         ____napi_schedule(sd, &sd->backlog);
3212         sd->received_rps++;
3213 }
3214
3215 #endif /* CONFIG_RPS */
3216
/*
 * Decide whether @sd belongs to another CPU.
 * If it does, push it on this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* Link the remote softnet_data at the head of our IPI list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3237
3238 #ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): not referenced in this part of the file; presumably the
 * number of buckets used when a flow limit table is allocated - confirm.
 */
3239 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3240 #endif
3241
3242 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3243 {
3244 #ifdef CONFIG_NET_FLOW_LIMIT
3245         struct sd_flow_limit *fl;
3246         struct softnet_data *sd;
3247         unsigned int old_flow, new_flow;
3248
3249         if (qlen < (netdev_max_backlog >> 1))
3250                 return false;
3251
3252         sd = &__get_cpu_var(softnet_data);
3253
3254         rcu_read_lock();
3255         fl = rcu_dereference(sd->flow_limit);
3256         if (fl) {
3257                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3258                 old_flow = fl->history[fl->history_head];
3259                 fl->history[fl->history_head] = new_flow;
3260
3261                 fl->history_head++;
3262                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3263
3264                 if (likely(fl->buckets[old_flow]))
3265                         fl->buckets[old_flow]--;
3266
3267                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3268                         fl->count++;
3269                         rcu_read_unlock();
3270                         return true;
3271                 }
3272         }
3273         rcu_read_unlock();
3274 #endif
3275         return false;
3276 }
3277
3278 /*
3279  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3280  * queue (may be a remote CPU queue).
 *
 * Returns NET_RX_SUCCESS when the skb was queued (the queue tail is then
 * reported through @qtail), or NET_RX_DROP after freeing the skb when the
 * device is not running, the queue is longer than netdev_max_backlog, or
 * skb_flow_limit() rejects the packet.
3281  */
3282 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3283                               unsigned int *qtail)
3284 {
3285         struct softnet_data *sd;
3286         unsigned long flags;
3287         unsigned int qlen;
3288
3289         sd = &per_cpu(softnet_data, cpu);
3290
3291         local_irq_save(flags);
3292
3293         rps_lock(sd);
3294         if (!netif_running(skb->dev))
3295                 goto drop;
3296         qlen = skb_queue_len(&sd->input_pkt_queue);
3297         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
                     /* Non-empty queue: skip scheduling and just append. */
3298                 if (skb_queue_len(&sd->input_pkt_queue)) {
3299 enqueue:
3300                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3301                         input_queue_tail_incr_save(sd, qtail);
3302                         rps_unlock(sd);
3303                         local_irq_restore(flags);
3304                         return NET_RX_SUCCESS;
3305                 }
3306
3307                 /* Schedule NAPI for backlog device
3308                  * We can use non atomic operation since we own the queue lock
3309                  */
3310                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                             /* Remote sd is queued for an IPI; local sd is scheduled here. */
3311                         if (!rps_ipi_queued(sd))
3312                                 ____napi_schedule(sd, &sd->backlog);
3313                 }
3314                 goto enqueue;
3315         }
3316
3317 drop:
3318         sd->dropped++;
3319         rps_unlock(sd);
3320
3321         local_irq_restore(flags);
3322
3323         atomic_long_inc(&skb->dev->rx_dropped);
3324         kfree_skb(skb);
3325         return NET_RX_DROP;
3326 }
3327
3328 static int netif_rx_internal(struct sk_buff *skb)
3329 {
3330         int ret;
3331
3332         net_timestamp_check(netdev_tstamp_prequeue, skb);
3333
3334         trace_netif_rx(skb);
3335 #ifdef CONFIG_RPS
3336         if (static_key_false(&rps_needed)) {
3337                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3338                 int cpu;
3339
3340                 preempt_disable();
3341                 rcu_read_lock();
3342
3343                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3344                 if (cpu < 0)
3345                         cpu = smp_processor_id();
3346
3347                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3348
3349                 rcu_read_unlock();
3350                 preempt_enable();
3351         } else
3352 #endif
3353         {
3354                 unsigned int qtail;
3355                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3356                 put_cpu();
3357         }
3358         return ret;
3359 }
3360
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	Entry point for device drivers: the received packet is traced and
 *	queued for the upper (protocol) levels to process.  The buffer may
 *	be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int verdict;

	trace_netif_rx_entry(skb);

	verdict = netif_rx_internal(skb);
	return verdict;
}
EXPORT_SYMBOL(netif_rx);
3383
/**
 *	netif_rx_ni	-	post buffer to the network code, then run softirqs
 *	@skb: buffer to post
 *
 *	Same queueing as netif_rx(), done with preemption disabled; any
 *	softirq left pending afterwards is processed via do_softirq() before
 *	preemption is re-enabled.  Returns the result of the enqueue.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int verdict;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	verdict = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return verdict;
}
EXPORT_SYMBOL(netif_rx_ni);
3399
3400 static void net_tx_action(struct softirq_action *h)
3401 {
3402         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3403
3404         if (sd->completion_queue) {
3405                 struct sk_buff *clist;
3406
3407                 local_irq_disable();
3408                 clist = sd->completion_queue;
3409                 sd->completion_queue = NULL;
3410                 local_irq_enable();
3411
3412                 while (clist) {
3413                         struct sk_buff *skb = clist;
3414                         clist = clist->next;
3415
3416                         WARN_ON(atomic_read(&skb->users));
3417                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3418                                 trace_consume_skb(skb);
3419                         else
3420                                 trace_kfree_skb(skb, net_tx_action);
3421                         __kfree_skb(skb);
3422                 }
3423         }
3424
3425         if (sd->output_queue) {
3426                 struct Qdisc *head;
3427
3428                 local_irq_disable();
3429                 head = sd->output_queue;
3430                 sd->output_queue = NULL;
3431                 sd->output_queue_tailp = &sd->output_queue;
3432                 local_irq_enable();
3433
3434                 while (head) {
3435                         struct Qdisc *q = head;
3436                         spinlock_t *root_lock;
3437
3438                         head = head->next_sched;
3439
3440                         root_lock = qdisc_lock(q);
3441                         if (spin_trylock(root_lock)) {
3442                                 smp_mb__before_atomic();
3443                                 clear_bit(__QDISC_STATE_SCHED,
3444                                           &q->state);
3445                                 qdisc_run(q);
3446                                 spin_unlock(root_lock);
3447                         } else {
3448                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3449                                               &q->state)) {
3450                                         __netif_reschedule(q);
3451                                 } else {
3452                                         smp_mb__before_atomic();
3453                                         clear_bit(__QDISC_STATE_SCHED,
3454                                                   &q->state);
3455                                 }
3456                         }
3457                 }
3458         }
3459 }
3460
3461 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3462     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3463 /* This hook is defined here for ATM LANE */
3464 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3465                              unsigned char *addr) __read_mostly;
3466 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3467 #endif
3468
3469 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 * instructions (a compare and two extra stores right now) when
 * CONFIG_NET_CLS_ACT is set but sch_ingress is not built.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 *
 */
3478 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3479 {
3480         struct net_device *dev = skb->dev;
3481         u32 ttl = G_TC_RTTL(skb->tc_verd);
3482         int result = TC_ACT_OK;
3483         struct Qdisc *q;
3484
3485         if (unlikely(MAX_RED_LOOP < ttl++)) {
3486                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3487                                      skb->skb_iif, dev->ifindex);
3488                 return TC_ACT_SHOT;
3489         }
3490
3491         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3492         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3493
3494         q = rxq->qdisc;
3495         if (q != &noop_qdisc) {
3496                 spin_lock(qdisc_lock(q));
3497                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3498                         result = qdisc_enqueue_root(skb, q);
3499                 spin_unlock(qdisc_lock(q));
3500         }
3501
3502         return result;
3503 }
3504
3505 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3506                                          struct packet_type **pt_prev,
3507                                          int *ret, struct net_device *orig_dev)
3508 {
3509         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3510
3511         if (!rxq || rxq->qdisc == &noop_qdisc)
3512                 goto out;
3513
3514         if (*pt_prev) {
3515                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3516                 *pt_prev = NULL;
3517         }
3518
3519         switch (ing_filter(skb, rxq)) {
3520         case TC_ACT_SHOT:
3521         case TC_ACT_STOLEN:
3522                 kfree_skb(skb);
3523                 return NULL;
3524         }
3525
3526 out:
3527         skb->tc_verd = 0;
3528         return skb;
3529 }
3530 #endif
3531
3532 /**
3533  *      netdev_rx_handler_register - register receive handler
3534  *      @dev: device to register a handler for
3535  *      @rx_handler: receive handler to register
3536  *      @rx_handler_data: data pointer that is used by rx handler
3537  *
3538  *      Register a receive handler for a device. This handler will then be
3539  *      called from __netif_receive_skb. A negative errno code is returned
3540  *      on a failure.
3541  *
3542  *      The caller must hold the rtnl_mutex.
3543  *
3544  *      For a general description of rx_handler, see enum rx_handler_result.
3545  */
3546 int netdev_rx_handler_register(struct net_device *dev,
3547                                rx_handler_func_t *rx_handler,
3548                                void *rx_handler_data)
3549 {
3550         ASSERT_RTNL();
3551
3552         if (dev->rx_handler)
3553                 return -EBUSY;
3554
3555         /* Note: rx_handler_data must be set before rx_handler */
3556         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3557         rcu_assign_pointer(dev->rx_handler, rx_handler);
3558
3559         return 0;
3560 }
3561 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3562
3563 /**
3564  *      netdev_rx_handler_unregister - unregister receive handler
3565  *      @dev: device to unregister a handler from
3566  *
3567  *      Unregister a receive handler from a device.
3568  *
3569  *      The caller must hold the rtnl_mutex.
3570  */
3571 void netdev_rx_handler_unregister(struct net_device *dev)
3572 {
3573
3574         ASSERT_RTNL();
3575         RCU_INIT_POINTER(dev->rx_handler, NULL);
3576         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3577          * section has a guarantee to see a non NULL rx_handler_data
3578          * as well.
3579          */
3580         synchronize_net();
3581         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3582 }
3583 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3584
3585 /*
3586  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3587  * the special handling of PFMEMALLOC skbs.
3588  */
3589 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3590 {
3591         switch (skb->protocol) {
3592         case htons(ETH_P_ARP):
3593         case htons(ETH_P_IP):
3594         case htons(ETH_P_IPV6):
3595         case htons(ETH_P_8021Q):
3596         case htons(ETH_P_8021AD):
3597                 return true;
3598         default:
3599                 return false;
3600         }
3601 }
3602
3603 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3604 {
3605         struct packet_type *ptype, *pt_prev;
3606         rx_handler_func_t *rx_handler;
3607         struct net_device *orig_dev;
3608         struct net_device *null_or_dev;
3609         bool deliver_exact = false;
3610         int ret = NET_RX_DROP;
3611         __be16 type;
3612
3613         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3614
3615         trace_netif_receive_skb(skb);
3616
3617         orig_dev = skb->dev;
3618
3619         skb_reset_network_header(skb);
3620         if (!skb_transport_header_was_set(skb))
3621                 skb_reset_transport_header(skb);
3622         skb_reset_mac_len(skb);
3623
3624         pt_prev = NULL;
3625
3626 another_round:
3627         skb->skb_iif = skb->dev->ifindex;
3628
3629         __this_cpu_inc(softnet_data.processed);
3630
3631         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3632             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3633                 skb = skb_vlan_untag(skb);
3634                 if (unlikely(!skb))
3635                         goto out;
3636         }
3637
3638 #ifdef CONFIG_NET_CLS_ACT
3639         if (skb->tc_verd & TC_NCLS) {
3640                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3641                 goto ncls;
3642         }
3643 #endif
3644
3645         if (pfmemalloc)
3646                 goto skip_taps;
3647
3648         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3649                 if (!ptype->dev || ptype->dev == skb->dev) {
3650                         if (pt_prev)
3651                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3652                         pt_prev = ptype;
3653                 }
3654         }
3655
3656 skip_taps:
3657 #ifdef CONFIG_NET_CLS_ACT
3658         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3659         if (!skb)
3660                 goto out;
3661 ncls:
3662 #endif
3663
3664         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3665                 goto drop;
3666
3667         if (vlan_tx_tag_present(skb)) {
3668                 if (pt_prev) {
3669                         ret = deliver_skb(skb, pt_prev, orig_dev);
3670                         pt_prev = NULL;
3671                 }
3672                 if (vlan_do_receive(&skb))
3673                         goto another_round;
3674                 else if (unlikely(!skb))
3675                         goto out;
3676         }
3677
3678         rx_handler = rcu_dereference(skb->dev->rx_handler);
3679         if (rx_handler) {
3680                 if (pt_prev) {
3681                         ret = deliver_skb(skb, pt_prev, orig_dev);
3682                         pt_prev = NULL;
3683                 }
3684                 switch (rx_handler(&skb)) {
3685                 case RX_HANDLER_CONSUMED:
3686                         ret = NET_RX_SUCCESS;
3687                         goto out;
3688                 case RX_HANDLER_ANOTHER:
3689                         goto another_round;
3690                 case RX_HANDLER_EXACT:
3691                         deliver_exact = true;
3692                 case RX_HANDLER_PASS:
3693                         break;
3694                 default:
3695                         BUG();
3696                 }
3697         }
3698
3699         if (unlikely(vlan_tx_tag_present(skb))) {
3700                 if (vlan_tx_tag_get_id(skb))
3701                         skb->pkt_type = PACKET_OTHERHOST;
3702                 /* Note: we might in the future use prio bits
3703                  * and set skb->priority like in vlan_do_receive()
3704                  * For the time being, just ignore Priority Code Point
3705                  */
3706                 skb->vlan_tci = 0;
3707         }
3708
3709         /* deliver only exact match when indicated */
3710         null_or_dev = deliver_exact ? skb->dev : NULL;
3711
3712         type = skb->protocol;
3713         list_for_each_entry_rcu(ptype,
3714                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3715                 if (ptype->type == type &&
3716                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3717                      ptype->dev == orig_dev)) {
3718                         if (pt_prev)
3719                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3720                         pt_prev = ptype;
3721                 }
3722         }
3723
3724         if (pt_prev) {
3725                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3726                         goto drop;
3727                 else
3728                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3729         } else {
3730 drop:
3731                 atomic_long_inc(&skb->dev->rx_dropped);
3732                 kfree_skb(skb);
3733                 /* Jamal, now you will not able to escape explaining
3734                  * me how you were going to use this. :-)
3735                  */
3736                 ret = NET_RX_DROP;
3737         }
3738
3739 out:
3740         return ret;
3741 }
3742
3743 static int __netif_receive_skb(struct sk_buff *skb)
3744 {
3745         int ret;
3746
3747         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3748                 unsigned long pflags = current->flags;
3749
3750                 /*
3751                  * PFMEMALLOC skbs are special, they should
3752                  * - be delivered to SOCK_MEMALLOC sockets only
3753                  * - stay away from userspace
3754                  * - have bounded memory usage
3755                  *
3756                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3757                  * context down to all allocation sites.
3758                  */
3759                 current->flags |= PF_MEMALLOC;
3760                 ret = __netif_receive_skb_core(skb, true);
3761                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3762         } else
3763                 ret = __netif_receive_skb_core(skb, false);
3764
3765         return ret;
3766 }
3767
3768 static int netif_receive_skb_internal(struct sk_buff *skb)
3769 {
3770         int ret;
3771
3772         net_timestamp_check(netdev_tstamp_prequeue, skb);
3773
3774         if (skb_defer_rx_timestamp(skb))
3775                 return NET_RX_SUCCESS;
3776
3777         rcu_read_lock();
3778
3779 #ifdef CONFIG_RPS
3780         if (static_key_false(&rps_needed)) {
3781                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3782                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3783
3784                 if (cpu >= 0) {
3785                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3786                         rcu_read_unlock();
3787                         return ret;
3788                 }
3789         }
3790 #endif
3791         ret = __netif_receive_skb(skb);
3792         rcu_read_unlock();
3793         return ret;
3794 }
3795
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	The call itself always succeeds; the buffer may nevertheless be
 *	dropped during processing, either for congestion control or by
 *	the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);
	ret = netif_receive_skb_internal(skb);

	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
3818
3819 /* Network device is going away, flush any packets still pending
3820  * Called with irqs disabled.
3821  */
3822 static void flush_backlog(void *arg)
3823 {
3824         struct net_device *dev = arg;
3825         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3826         struct sk_buff *skb, *tmp;
3827
3828         rps_lock(sd);
3829         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3830                 if (skb->dev == dev) {
3831                         __skb_unlink(skb, &sd->input_pkt_queue);
3832                         kfree_skb(skb);
3833                         input_queue_head_incr(sd);
3834                 }
3835         }
3836         rps_unlock(sd);
3837
3838         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3839                 if (skb->dev == dev) {
3840                         __skb_unlink(skb, &sd->process_queue);
3841                         kfree_skb(skb);
3842                         input_queue_head_incr(sd);
3843                 }
3844         }
3845 }
3846
3847 static int napi_gro_complete(struct sk_buff *skb)
3848 {
3849         struct packet_offload *ptype;
3850         __be16 type = skb->protocol;
3851         struct list_head *head = &offload_base;
3852         int err = -ENOENT;
3853
3854         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3855
3856         if (NAPI_GRO_CB(skb)->count == 1) {
3857                 skb_shinfo(skb)->gso_size = 0;
3858                 goto out;
3859         }
3860
3861         rcu_read_lock();
3862         list_for_each_entry_rcu(ptype, head, list) {
3863                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3864                         continue;
3865
3866                 err = ptype->callbacks.gro_complete(skb, 0);
3867                 break;
3868         }
3869         rcu_read_unlock();
3870
3871         if (err) {
3872                 WARN_ON(&ptype->list == head);
3873                 kfree_skb(skb);
3874                 return NET_RX_SUCCESS;
3875         }
3876
3877 out:
3878         return netif_receive_skb_internal(skb);
3879 }
3880
3881 /* napi->gro_list contains packets ordered by age.
3882  * youngest packets at the head of it.
3883  * Complete skbs in reverse order to reduce latencies.
3884  */
3885 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3886 {
3887         struct sk_buff *skb, *prev = NULL;
3888
3889         /* scan list and build reverse chain */
3890         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3891                 skb->prev = prev;
3892                 prev = skb;
3893         }
3894
3895         for (skb = prev; skb; skb = prev) {
3896                 skb->next = NULL;
3897
3898                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3899                         return;
3900
3901                 prev = skb->prev;
3902                 napi_gro_complete(skb);
3903                 napi->gro_count--;
3904         }
3905
3906         napi->gro_list = NULL;
3907 }
3908 EXPORT_SYMBOL(napi_gro_flush);
3909
3910 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3911 {
3912         struct sk_buff *p;
3913         unsigned int maclen = skb->dev->hard_header_len;
3914         u32 hash = skb_get_hash_raw(skb);
3915
3916         for (p = napi->gro_list; p; p = p->next) {
3917                 unsigned long diffs;
3918
3919                 NAPI_GRO_CB(p)->flush = 0;
3920
3921                 if (hash != skb_get_hash_raw(p)) {
3922                         NAPI_GRO_CB(p)->same_flow = 0;
3923                         continue;
3924                 }
3925
3926                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3927                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3928                 if (maclen == ETH_HLEN)
3929                         diffs |= compare_ether_header(skb_mac_header(p),
3930                                                       skb_mac_header(skb));
3931                 else if (!diffs)
3932                         diffs = memcmp(skb_mac_header(p),
3933                                        skb_mac_header(skb),
3934                                        maclen);
3935                 NAPI_GRO_CB(p)->same_flow = !diffs;
3936         }
3937 }
3938
3939 static void skb_gro_reset_offset(struct sk_buff *skb)
3940 {
3941         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3942         const skb_frag_t *frag0 = &pinfo->frags[0];
3943
3944         NAPI_GRO_CB(skb)->data_offset = 0;
3945         NAPI_GRO_CB(skb)->frag0 = NULL;
3946         NAPI_GRO_CB(skb)->frag0_len = 0;
3947
3948         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3949             pinfo->nr_frags &&
3950             !PageHighMem(skb_frag_page(frag0))) {
3951                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3952                 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
3953                                                     skb_frag_size(frag0),
3954                                                     skb->end - skb->tail);
3955         }
3956 }
3957
3958 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3959 {
3960         struct skb_shared_info *pinfo = skb_shinfo(skb);
3961
3962         BUG_ON(skb->end - skb->tail < grow);
3963
3964         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3965
3966         skb->data_len -= grow;
3967         skb->tail += grow;
3968
3969         pinfo->frags[0].page_offset += grow;
3970         skb_frag_size_sub(&pinfo->frags[0], grow);
3971
3972         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3973                 skb_frag_unref(skb, 0);
3974                 memmove(pinfo->frags, pinfo->frags + 1,
3975                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3976         }
3977 }
3978
3979 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3980 {
3981         struct sk_buff **pp = NULL;
3982         struct packet_offload *ptype;
3983         __be16 type = skb->protocol;
3984         struct list_head *head = &offload_base;
3985         int same_flow;
3986         enum gro_result ret;
3987         int grow;
3988
3989         if (!(skb->dev->features & NETIF_F_GRO))
3990                 goto normal;
3991
3992         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3993                 goto normal;
3994
3995         gro_list_prepare(napi, skb);
3996         NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3997
3998         rcu_read_lock();
3999         list_for_each_entry_rcu(ptype, head, list) {
4000                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4001                         continue;
4002
4003                 skb_set_network_header(skb, skb_gro_offset(skb));
4004                 skb_reset_mac_len(skb);
4005                 NAPI_GRO_CB(skb)->same_flow = 0;
4006                 NAPI_GRO_CB(skb)->flush = 0;
4007                 NAPI_GRO_CB(skb)->free = 0;
4008                 NAPI_GRO_CB(skb)->encap_mark = 0;
4009
4010                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4011                 break;
4012         }
4013         rcu_read_unlock();
4014
4015         if (&ptype->list == head)
4016                 goto normal;
4017
4018         same_flow = NAPI_GRO_CB(skb)->same_flow;
4019         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4020
4021         if (pp) {
4022                 struct sk_buff *nskb = *pp;
4023
4024                 *pp = nskb->next;
4025                 nskb->next = NULL;
4026                 napi_gro_complete(nskb);
4027                 napi->gro_count--;
4028         }
4029
4030         if (same_flow)
4031                 goto ok;
4032
4033         if (NAPI_GRO_CB(skb)->flush)
4034                 goto normal;
4035
4036         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4037                 struct sk_buff *nskb = napi->gro_list;
4038
4039                 /* locate the end of the list to select the 'oldest' flow */
4040                 while (nskb->next) {
4041                         pp = &nskb->next;
4042                         nskb = *pp;
4043                 }
4044                 *pp = NULL;
4045                 nskb->next = NULL;
4046                 napi_gro_complete(nskb);
4047         } else {
4048                 napi->gro_count++;
4049         }
4050         NAPI_GRO_CB(skb)->count = 1;
4051         NAPI_GRO_CB(skb)->age = jiffies;
4052         NAPI_GRO_CB(skb)->last = skb;
4053         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4054         skb->next = napi->gro_list;
4055         napi->gro_list = skb;
4056         ret = GRO_HELD;
4057
4058 pull:
4059         grow = skb_gro_offset(skb) - skb_headlen(skb);
4060         if (grow > 0)
4061                 gro_pull_from_frag0(skb, grow);
4062 ok:
4063         return ret;
4064
4065 normal:
4066         ret = GRO_NORMAL;
4067         goto pull;
4068 }
4069
4070 struct packet_offload *gro_find_receive_by_type(__be16 type)
4071 {
4072         struct list_head *offload_head = &offload_base;
4073         struct packet_offload *ptype;
4074
4075         list_for_each_entry_rcu(ptype, offload_head, list) {
4076                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4077                         continue;
4078                 return ptype;
4079         }
4080         return NULL;
4081 }
4082 EXPORT_SYMBOL(gro_find_receive_by_type);
4083
4084 struct packet_offload *gro_find_complete_by_type(__be16 type)
4085 {
4086         struct list_head *offload_head = &offload_base;
4087         struct packet_offload *ptype;
4088
4089         list_for_each_entry_rcu(ptype, offload_head, list) {
4090                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4091                         continue;
4092                 return ptype;
4093         }
4094         return NULL;
4095 }
4096 EXPORT_SYMBOL(gro_find_complete_by_type);
4097
4098 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4099 {
4100         switch (ret) {
4101         case GRO_NORMAL:
4102                 if (netif_receive_skb_internal(skb))
4103                         ret = GRO_DROP;
4104                 break;
4105
4106         case GRO_DROP:
4107                 kfree_skb(skb);
4108                 break;
4109
4110         case GRO_MERGED_FREE:
4111                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4112                         kmem_cache_free(skbuff_head_cache, skb);
4113                 else
4114                         __kfree_skb(skb);
4115                 break;
4116
4117         case GRO_HELD:
4118         case GRO_MERGED:
4119                 break;
4120         }
4121
4122         return ret;
4123 }
4124
4125 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4126 {
4127         trace_napi_gro_receive_entry(skb);
4128
4129         skb_gro_reset_offset(skb);
4130
4131         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4132 }
4133 EXPORT_SYMBOL(napi_gro_receive);
4134
4135 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4136 {
4137         __skb_pull(skb, skb_headlen(skb));
4138         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4139         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4140         skb->vlan_tci = 0;
4141         skb->dev = napi->dev;
4142         skb->skb_iif = 0;
4143         skb->encapsulation = 0;
4144         skb_shinfo(skb)->gso_type = 0;
4145         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4146
4147         napi->skb = skb;
4148 }
4149
4150 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4151 {
4152         struct sk_buff *skb = napi->skb;
4153
4154         if (!skb) {
4155                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4156                 napi->skb = skb;
4157         }
4158         return skb;
4159 }
4160 EXPORT_SYMBOL(napi_get_frags);
4161
4162 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4163                                       struct sk_buff *skb,
4164                                       gro_result_t ret)
4165 {
4166         switch (ret) {
4167         case GRO_NORMAL:
4168         case GRO_HELD:
4169                 __skb_push(skb, ETH_HLEN);
4170                 skb->protocol = eth_type_trans(skb, skb->dev);
4171                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4172                         ret = GRO_DROP;
4173                 break;
4174
4175         case GRO_DROP:
4176                 napi_reuse_skb(napi, skb);
4177                 break;
4178
4179         case GRO_MERGED_FREE:
4180                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4181                         kmem_cache_free(skbuff_head_cache, skb);
4182                 else
4183                         napi_reuse_skb(napi, skb);
4184                 break;
4185
4186         case GRO_MERGED:
4187                 break;
4188         }
4189
4190         return ret;
4191 }
4192
4193 /* Upper GRO stack assumes network header starts at gro_offset=0
4194  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4195  * We copy ethernet header into skb->data to have a common layout.
4196  */
4197 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4198 {
4199         struct sk_buff *skb = napi->skb;
4200         const struct ethhdr *eth;
4201         unsigned int hlen = sizeof(*eth);
4202
4203         napi->skb = NULL;
4204
4205         skb_reset_mac_header(skb);
4206         skb_gro_reset_offset(skb);
4207
4208         eth = skb_gro_header_fast(skb, 0);
4209         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4210                 eth = skb_gro_header_slow(skb, hlen, 0);
4211                 if (unlikely(!eth)) {
4212                         napi_reuse_skb(napi, skb);
4213                         return NULL;
4214                 }
4215         } else {
4216                 gro_pull_from_frag0(skb, hlen);
4217                 NAPI_GRO_CB(skb)->frag0 += hlen;
4218                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4219         }
4220         __skb_pull(skb, hlen);
4221
4222         /*
4223          * This works because the only protocols we care about don't require
4224          * special handling.
4225          * We'll fix it up properly in napi_frags_finish()
4226          */
4227         skb->protocol = eth->h_proto;
4228
4229         return skb;
4230 }
4231
4232 gro_result_t napi_gro_frags(struct napi_struct *napi)
4233 {
4234         struct sk_buff *skb = napi_frags_skb(napi);
4235
4236         if (!skb)
4237                 return GRO_DROP;
4238
4239         trace_napi_gro_frags_entry(skb);
4240
4241         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4242 }
4243 EXPORT_SYMBOL(napi_gro_frags);
4244
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * The pending list is detached from @sd while irqs are still off; the
 * IPIs themselves are sent after irqs have been re-enabled, and only to
 * CPUs that are online.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending)
		sd->rps_ipi_list = NULL;
#endif

	local_irq_enable();

#ifdef CONFIG_RPS
	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		/* fetch the link first, before the remote CPU is kicked */
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	}
#endif
}
4272
4273 static int process_backlog(struct napi_struct *napi, int quota)
4274 {
4275         int work = 0;
4276         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4277
4278 #ifdef CONFIG_RPS
4279         /* Check if we have pending ipi, its better to send them now,
4280          * not waiting net_rx_action() end.
4281          */
4282         if (sd->rps_ipi_list) {
4283                 local_irq_disable();
4284                 net_rps_action_and_irq_enable(sd);
4285         }
4286 #endif
4287         napi->weight = weight_p;
4288         local_irq_disable();
4289         while (1) {
4290                 struct sk_buff *skb;
4291
4292                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4293                         rcu_read_lock();
4294                         local_irq_enable();
4295                         __netif_receive_skb(skb);
4296                         rcu_read_unlock();
4297                         local_irq_disable();
4298                         input_queue_head_incr(sd);
4299                         if (++work >= quota) {
4300                                 local_irq_enable();
4301                                 return work;
4302                         }
4303                 }
4304
4305                 rps_lock(sd);
4306                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4307                         /*
4308                          * Inline a custom version of __napi_complete().
4309                          * only current cpu owns and manipulates this napi,
4310                          * and NAPI_STATE_SCHED is the only possible flag set
4311                          * on backlog.
4312                          * We can use a plain write instead of clear_bit(),
4313                          * and we dont need an smp_mb() memory barrier.
4314                          */
4315                         list_del(&napi->poll_list);
4316                         napi->state = 0;
4317                         rps_unlock(sd);
4318
4319                         break;
4320                 }
4321
4322                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4323                                            &sd->process_queue);
4324                 rps_unlock(sd);
4325         }
4326         local_irq_enable();
4327
4328         return work;
4329 }
4330
4331 /**
4332  * __napi_schedule - schedule for receive
4333  * @n: entry to schedule
4334  *
4335  * The entry's receive function will be scheduled to run
4336  */
4337 void __napi_schedule(struct napi_struct *n)
4338 {
4339         unsigned long flags;
4340
4341         local_irq_save(flags);
4342         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4343         local_irq_restore(flags);
4344 }
4345 EXPORT_SYMBOL(__napi_schedule);
4346
4347 void __napi_complete(struct napi_struct *n)
4348 {
4349         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4350         BUG_ON(n->gro_list);
4351
4352         list_del(&n->poll_list);
4353         smp_mb__before_atomic();
4354         clear_bit(NAPI_STATE_SCHED, &n->state);
4355 }
4356 EXPORT_SYMBOL(__napi_complete);
4357
4358 void napi_complete(struct napi_struct *n)
4359 {
4360         unsigned long flags;
4361
4362         /*
4363          * don't let napi dequeue from the cpu poll list
4364          * just in case its running on a different cpu
4365          */
4366         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4367                 return;
4368
4369         napi_gro_flush(n, false);
4370         local_irq_save(flags);
4371         __napi_complete(n);
4372         local_irq_restore(flags);
4373 }
4374 EXPORT_SYMBOL(napi_complete);
4375
4376 /* must be called under rcu_read_lock(), as we dont take a reference */
4377 struct napi_struct *napi_by_id(unsigned int napi_id)
4378 {
4379         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4380         struct napi_struct *napi;
4381
4382         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4383                 if (napi->napi_id == napi_id)
4384                         return napi;
4385
4386         return NULL;
4387 }
4388 EXPORT_SYMBOL_GPL(napi_by_id);
4389
4390 void napi_hash_add(struct napi_struct *napi)
4391 {
4392         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4393
4394                 spin_lock(&napi_hash_lock);
4395
4396                 /* 0 is not a valid id, we also skip an id that is taken
4397                  * we expect both events to be extremely rare
4398                  */
4399                 napi->napi_id = 0;
4400                 while (!napi->napi_id) {
4401                         napi->napi_id = ++napi_gen_id;
4402                         if (napi_by_id(napi->napi_id))
4403                                 napi->napi_id = 0;
4404                 }
4405
4406                 hlist_add_head_rcu(&napi->napi_hash_node,
4407                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4408
4409                 spin_unlock(&napi_hash_lock);
4410         }
4411 }
4412 EXPORT_SYMBOL_GPL(napi_hash_add);
4413
4414 /* Warning : caller is responsible to make sure rcu grace period
4415  * is respected before freeing memory containing @napi
4416  */
4417 void napi_hash_del(struct napi_struct *napi)
4418 {
4419         spin_lock(&napi_hash_lock);
4420
4421         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4422                 hlist_del_rcu(&napi->napi_hash_node);
4423
4424         spin_unlock(&napi_hash_lock);
4425 }
4426 EXPORT_SYMBOL_GPL(napi_hash_del);
4427
4428 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4429                     int (*poll)(struct napi_struct *, int), int weight)
4430 {
4431         INIT_LIST_HEAD(&napi->poll_list);
4432         napi->gro_count = 0;
4433         napi->gro_list = NULL;
4434         napi->skb = NULL;
4435         napi->poll = poll;
4436         if (weight > NAPI_POLL_WEIGHT)
4437                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4438                             weight, dev->name);
4439         napi->weight = weight;
4440         list_add(&napi->dev_list, &dev->napi_list);
4441         napi->dev = dev;
4442 #ifdef CONFIG_NETPOLL
4443         spin_lock_init(&napi->poll_lock);
4444         napi->poll_owner = -1;
4445 #endif
4446         set_bit(NAPI_STATE_SCHED, &napi->state);
4447 }
4448 EXPORT_SYMBOL(netif_napi_add);
4449
4450 void netif_napi_del(struct napi_struct *napi)
4451 {
4452         list_del_init(&napi->dev_list);
4453         napi_free_frags(napi);
4454
4455         kfree_skb_list(napi->gro_list);
4456         napi->gro_list = NULL;
4457         napi->gro_count = 0;
4458 }
4459 EXPORT_SYMBOL(netif_napi_del);
4460
4461 static void net_rx_action(struct softirq_action *h)
4462 {
4463         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4464         unsigned long time_limit = jiffies + 2;
4465         int budget = netdev_budget;
4466         void *have;
4467
4468         local_irq_disable();
4469
4470         while (!list_empty(&sd->poll_list)) {
4471                 struct napi_struct *n;
4472                 int work, weight;
4473
4474                 /* If softirq window is exhuasted then punt.
4475                  * Allow this to run for 2 jiffies since which will allow
4476                  * an average latency of 1.5/HZ.
4477                  */
4478                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4479                         goto softnet_break;
4480
4481                 local_irq_enable();
4482
4483                 /* Even though interrupts have been re-enabled, this
4484                  * access is safe because interrupts can only add new
4485                  * entries to the tail of this list, and only ->poll()
4486                  * calls can remove this head entry from the list.
4487                  */
4488                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4489
4490                 have = netpoll_poll_lock(n);
4491
4492                 weight = n->weight;
4493
4494                 /* This NAPI_STATE_SCHED test is for avoiding a race
4495                  * with netpoll's poll_napi().  Only the entity which
4496                  * obtains the lock and sees NAPI_STATE_SCHED set will
4497                  * actually make the ->poll() call.  Therefore we avoid
4498                  * accidentally calling ->poll() when NAPI is not scheduled.
4499                  */
4500                 work = 0;
4501                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4502                         work = n->poll(n, weight);
4503                         trace_napi_poll(n);
4504                 }
4505
4506                 WARN_ON_ONCE(work > weight);
4507
4508                 budget -= work;
4509
4510                 local_irq_disable();
4511
4512                 /* Drivers must not modify the NAPI state if they
4513                  * consume the entire weight.  In such cases this code
4514                  * still "owns" the NAPI instance and therefore can
4515                  * move the instance around on the list at-will.
4516                  */
4517                 if (unlikely(work == weight)) {
4518                         if (unlikely(napi_disable_pending(n))) {
4519                                 local_irq_enable();
4520                                 napi_complete(n);
4521                                 local_irq_disable();
4522                         } else {
4523                                 if (n->gro_list) {
4524                                         /* flush too old packets
4525                                          * If HZ < 1000, flush all packets.
4526                                          */
4527                                         local_irq_enable();
4528                                         napi_gro_flush(n, HZ >= 1000);
4529                                         local_irq_disable();
4530                                 }
4531                                 list_move_tail(&n->poll_list, &sd->poll_list);
4532                         }
4533                 }
4534
4535                 netpoll_poll_unlock(have);
4536         }
4537 out:
4538         net_rps_action_and_irq_enable(sd);
4539
4540 #ifdef CONFIG_NET_DMA
4541         /*
4542          * There may not be any more sk_buffs coming right now, so push
4543          * any pending DMA copies to hardware
4544          */
4545         dma_issue_pending_all();
4546 #endif
4547
4548         return;
4549
4550 softnet_break:
4551         sd->time_squeeze++;
4552         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4553         goto out;
4554 }
4555
4556 struct netdev_adjacent {
4557         struct net_device *dev;
4558
4559         /* upper master flag, there can only be one master device per list */
4560         bool master;
4561
4562         /* counter for the number of times this device was added to us */
4563         u16 ref_nr;
4564
4565         /* private field for the users */
4566         void *private;
4567
4568         struct list_head list;
4569         struct rcu_head rcu;
4570 };
4571
4572 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4573                                                  struct net_device *adj_dev,
4574                                                  struct list_head *adj_list)
4575 {
4576         struct netdev_adjacent *adj;
4577
4578         list_for_each_entry(adj, adj_list, list) {
4579                 if (adj->dev == adj_dev)
4580                         return adj;
4581         }
4582         return NULL;
4583 }
4584
4585 /**
4586  * netdev_has_upper_dev - Check if device is linked to an upper device
4587  * @dev: device
4588  * @upper_dev: upper device to check
4589  *
4590  * Find out if a device is linked to specified upper device and return true
4591  * in case it is. Note that this checks only immediate upper device,
4592  * not through a complete stack of devices. The caller must hold the RTNL lock.
4593  */
4594 bool netdev_has_upper_dev(struct net_device *dev,
4595                           struct net_device *upper_dev)
4596 {
4597         ASSERT_RTNL();
4598
4599         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4600 }
4601 EXPORT_SYMBOL(netdev_has_upper_dev);
4602
4603 /**
4604  * netdev_has_any_upper_dev - Check if device is linked to some device
4605  * @dev: device
4606  *
4607  * Find out if a device is linked to an upper device and return true in case
4608  * it is. The caller must hold the RTNL lock.
4609  */
4610 static bool netdev_has_any_upper_dev(struct net_device *dev)
4611 {
4612         ASSERT_RTNL();
4613
4614         return !list_empty(&dev->all_adj_list.upper);
4615 }
4616
4617 /**
4618  * netdev_master_upper_dev_get - Get master upper device
4619  * @dev: device
4620  *
4621  * Find a master upper device and return pointer to it or NULL in case
4622  * it's not there. The caller must hold the RTNL lock.
4623  */
4624 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4625 {
4626         struct netdev_adjacent *upper;
4627
4628         ASSERT_RTNL();
4629
4630         if (list_empty(&dev->adj_list.upper))
4631                 return NULL;
4632
4633         upper = list_first_entry(&dev->adj_list.upper,
4634                                  struct netdev_adjacent, list);
4635         if (likely(upper->master))
4636                 return upper->dev;
4637         return NULL;
4638 }
4639 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4640
4641 void *netdev_adjacent_get_private(struct list_head *adj_list)
4642 {
4643         struct netdev_adjacent *adj;
4644
4645         adj = list_entry(adj_list, struct netdev_adjacent, list);
4646
4647         return adj->private;
4648 }
4649 EXPORT_SYMBOL(netdev_adjacent_get_private);
4650
4651 /**
4652  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4653  * @dev: device
4654  * @iter: list_head ** of the current position
4655  *
4656  * Gets the next device from the dev's upper list, starting from iter
4657  * position. The caller must hold RCU read lock.
4658  */
4659 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4660                                                  struct list_head **iter)
4661 {
4662         struct netdev_adjacent *upper;
4663
4664         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4665
4666         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4667
4668         if (&upper->list == &dev->adj_list.upper)
4669                 return NULL;
4670
4671         *iter = &upper->list;
4672
4673         return upper->dev;
4674 }
4675 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4676
4677 /**
4678  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4679  * @dev: device
4680  * @iter: list_head ** of the current position
4681  *
4682  * Gets the next device from the dev's upper list, starting from iter
4683  * position. The caller must hold RCU read lock.
4684  */
4685 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4686                                                      struct list_head **iter)
4687 {
4688         struct netdev_adjacent *upper;
4689
4690         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4691
4692         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4693
4694         if (&upper->list == &dev->all_adj_list.upper)
4695                 return NULL;
4696
4697         *iter = &upper->list;
4698
4699         return upper->dev;
4700 }
4701 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4702
4703 /**
4704  * netdev_lower_get_next_private - Get the next ->private from the
4705  *                                 lower neighbour list
4706  * @dev: device
4707  * @iter: list_head ** of the current position
4708  *
4709  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4710  * list, starting from iter position. The caller must hold either hold the
4711  * RTNL lock or its own locking that guarantees that the neighbour lower
4712  * list will remain unchainged.
4713  */
4714 void *netdev_lower_get_next_private(struct net_device *dev,
4715                                     struct list_head **iter)
4716 {
4717         struct netdev_adjacent *lower;
4718
4719         lower = list_entry(*iter, struct netdev_adjacent, list);
4720
4721         if (&lower->list == &dev->adj_list.lower)
4722                 return NULL;
4723
4724         *iter = lower->list.next;
4725
4726         return lower->private;
4727 }
4728 EXPORT_SYMBOL(netdev_lower_get_next_private);
4729
4730 /**
4731  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4732  *                                     lower neighbour list, RCU
4733  *                                     variant
4734  * @dev: device
4735  * @iter: list_head ** of the current position
4736  *
4737  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4738  * list, starting from iter position. The caller must hold RCU read lock.
4739  */
4740 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4741                                         struct list_head **iter)
4742 {
4743         struct netdev_adjacent *lower;
4744
4745         WARN_ON_ONCE(!rcu_read_lock_held());
4746
4747         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4748
4749         if (&lower->list == &dev->adj_list.lower)
4750                 return NULL;
4751
4752         *iter = &lower->list;
4753
4754         return lower->private;
4755 }
4756 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4757
4758 /**
4759  * netdev_lower_get_next - Get the next device from the lower neighbour
4760  *                         list
4761  * @dev: device
4762  * @iter: list_head ** of the current position
4763  *
4764  * Gets the next netdev_adjacent from the dev's lower neighbour
4765  * list, starting from iter position. The caller must hold RTNL lock or
4766  * its own locking that guarantees that the neighbour lower
4767  * list will remain unchainged.
4768  */
4769 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4770 {
4771         struct netdev_adjacent *lower;
4772
4773         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4774
4775         if (&lower->list == &dev->adj_list.lower)
4776                 return NULL;
4777
4778         *iter = &lower->list;
4779
4780         return lower->dev;
4781 }
4782 EXPORT_SYMBOL(netdev_lower_get_next);
4783
4784 /**
4785  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4786  *                                     lower neighbour list, RCU
4787  *                                     variant
4788  * @dev: device
4789  *
4790  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4791  * list. The caller must hold RCU read lock.
4792  */
4793 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4794 {
4795         struct netdev_adjacent *lower;
4796
4797         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4798                         struct netdev_adjacent, list);
4799         if (lower)
4800                 return lower->private;
4801         return NULL;
4802 }
4803 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4804
4805 /**
4806  * netdev_master_upper_dev_get_rcu - Get master upper device
4807  * @dev: device
4808  *
4809  * Find a master upper device and return pointer to it or NULL in case
4810  * it's not there. The caller must hold the RCU read lock.
4811  */
4812 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4813 {
4814         struct netdev_adjacent *upper;
4815
4816         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4817                                        struct netdev_adjacent, list);
4818         if (upper && likely(upper->master))
4819                 return upper->dev;
4820         return NULL;
4821 }
4822 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4823
4824 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4825                               struct net_device *adj_dev,
4826                               struct list_head *dev_list)
4827 {
4828         char linkname[IFNAMSIZ+7];
4829         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4830                 "upper_%s" : "lower_%s", adj_dev->name);
4831         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4832                                  linkname);
4833 }
4834 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4835                                char *name,
4836                                struct list_head *dev_list)
4837 {
4838         char linkname[IFNAMSIZ+7];
4839         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4840                 "upper_%s" : "lower_%s", name);
4841         sysfs_remove_link(&(dev->dev.kobj), linkname);
4842 }
4843
4844 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4845                                                  struct net_device *adj_dev,
4846                                                  struct list_head *dev_list)
4847 {
4848         return (dev_list == &dev->adj_list.upper ||
4849                 dev_list == &dev->adj_list.lower) &&
4850                 net_eq(dev_net(dev), dev_net(adj_dev));
4851 }
4852
4853 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4854                                         struct net_device *adj_dev,
4855                                         u16 ref_nr,
4856                                         struct list_head *dev_list,
4857                                         void *private, bool master)
4858 {
4859         struct netdev_adjacent *adj;
4860         int ret;
4861
4862         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4863
4864         if (adj) {
4865                 adj->ref_nr += ref_nr;
4866                 return 0;
4867         }
4868
4869         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4870         if (!adj)
4871                 return -ENOMEM;
4872
4873         adj->dev = adj_dev;
4874         adj->master = master;
4875         adj->ref_nr = ref_nr;
4876         adj->private = private;
4877         dev_hold(adj_dev);
4878
4879         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4880                  adj_dev->name, dev->name, adj_dev->name);
4881
4882         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4883                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4884                 if (ret)
4885                         goto free_adj;
4886         }
4887
4888         /* Ensure that master link is always the first item in list. */
4889         if (master) {
4890                 ret = sysfs_create_link(&(dev->dev.kobj),
4891                                         &(adj_dev->dev.kobj), "master");
4892                 if (ret)
4893                         goto remove_symlinks;
4894
4895                 list_add_rcu(&adj->list, dev_list);
4896         } else {
4897                 list_add_tail_rcu(&adj->list, dev_list);
4898         }
4899
4900         return 0;
4901
4902 remove_symlinks:
4903         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4904                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4905 free_adj:
4906         kfree(adj);
4907         dev_put(adj_dev);
4908
4909         return ret;
4910 }
4911
4912 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4913                                          struct net_device *adj_dev,
4914                                          u16 ref_nr,
4915                                          struct list_head *dev_list)
4916 {
4917         struct netdev_adjacent *adj;
4918
4919         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4920
4921         if (!adj) {
4922                 pr_err("tried to remove device %s from %s\n",
4923                        dev->name, adj_dev->name);
4924                 BUG();
4925         }
4926
4927         if (adj->ref_nr > ref_nr) {
4928                 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
4929                          ref_nr, adj->ref_nr-ref_nr);
4930                 adj->ref_nr -= ref_nr;
4931                 return;
4932         }
4933
4934         if (adj->master)
4935                 sysfs_remove_link(&(dev->dev.kobj), "master");
4936
4937         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4938                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4939
4940         list_del_rcu(&adj->list);
4941         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4942                  adj_dev->name, dev->name, adj_dev->name);
4943         dev_put(adj_dev);
4944         kfree_rcu(adj, rcu);
4945 }
4946
4947 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4948                                             struct net_device *upper_dev,
4949                                             u16 ref_nr,
4950                                             struct list_head *up_list,
4951                                             struct list_head *down_list,
4952                                             void *private, bool master)
4953 {
4954         int ret;
4955
4956         ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
4957                                            private, master);
4958         if (ret)
4959                 return ret;
4960
4961         ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
4962                                            private, false);
4963         if (ret) {
4964                 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
4965                 return ret;
4966         }
4967
4968         return 0;
4969 }
4970
4971 static int __netdev_adjacent_dev_link(struct net_device *dev,
4972                                       struct net_device *upper_dev,
4973                                       u16 ref_nr)
4974 {
4975         return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
4976                                                 &dev->all_adj_list.upper,
4977                                                 &upper_dev->all_adj_list.lower,
4978                                                 NULL, false);
4979 }
4980
4981 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4982                                                struct net_device *upper_dev,
4983                                                u16 ref_nr,
4984                                                struct list_head *up_list,
4985                                                struct list_head *down_list)
4986 {
4987         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
4988         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
4989 }
4990
4991 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4992                                          struct net_device *upper_dev,
4993                                          u16 ref_nr)
4994 {
4995         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
4996                                            &dev->all_adj_list.upper,
4997                                            &upper_dev->all_adj_list.lower);
4998 }
4999
5000 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5001                                                 struct net_device *upper_dev,
5002                                                 void *private, bool master)
5003 {
5004         int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5005
5006         if (ret)
5007                 return ret;
5008
5009         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5010                                                &dev->adj_list.upper,
5011                                                &upper_dev->adj_list.lower,
5012                                                private, master);
5013         if (ret) {
5014                 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5015                 return ret;
5016         }
5017
5018         return 0;
5019 }
5020
5021 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5022                                                    struct net_device *upper_dev)
5023 {
5024         __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5025         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5026                                            &dev->adj_list.upper,
5027                                            &upper_dev->adj_list.lower);
5028 }
5029
5030 static int __netdev_upper_dev_link(struct net_device *dev,
5031                                    struct net_device *upper_dev, bool master,
5032                                    void *private)
5033 {
5034         struct netdev_adjacent *i, *j, *to_i, *to_j;
5035         int ret = 0;
5036
5037         ASSERT_RTNL();
5038
5039         if (dev == upper_dev)
5040                 return -EBUSY;
5041
5042         /* To prevent loops, check if dev is not upper device to upper_dev. */
5043         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5044                 return -EBUSY;
5045
5046         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5047                 return -EEXIST;
5048
5049         if (master && netdev_master_upper_dev_get(dev))
5050                 return -EBUSY;
5051
5052         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5053                                                    master);
5054         if (ret)
5055                 return ret;
5056
5057         /* Now that we linked these devs, make all the upper_dev's
5058          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5059          * versa, and don't forget the devices itself. All of these
5060          * links are non-neighbours.
5061          */
5062         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5063                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5064                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5065                                  i->dev->name, j->dev->name);
5066                         ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5067                         if (ret)
5068                                 goto rollback_mesh;
5069                 }
5070         }
5071
5072         /* add dev to every upper_dev's upper device */
5073         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5074                 pr_debug("linking %s's upper device %s with %s\n",
5075                          upper_dev->name, i->dev->name, dev->name);
5076                 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5077                 if (ret)
5078                         goto rollback_upper_mesh;
5079         }
5080
5081         /* add upper_dev to every dev's lower device */
5082         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5083                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5084                          i->dev->name, upper_dev->name);
5085                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5086                 if (ret)
5087                         goto rollback_lower_mesh;
5088         }
5089
5090         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5091         return 0;
5092
5093 rollback_lower_mesh:
5094         to_i = i;
5095         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5096                 if (i == to_i)
5097                         break;
5098                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5099         }
5100
5101         i = NULL;
5102
5103 rollback_upper_mesh:
5104         to_i = i;
5105         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5106                 if (i == to_i)
5107                         break;
5108                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5109         }
5110
5111         i = j = NULL;
5112
5113 rollback_mesh:
5114         to_i = i;
5115         to_j = j;
5116         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5117                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5118                         if (i == to_i && j == to_j)
5119                                 break;
5120                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5121                 }
5122                 if (i == to_i)
5123                         break;
5124         }
5125
5126         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5127
5128         return ret;
5129 }
5130
5131 /**
5132  * netdev_upper_dev_link - Add a link to the upper device
5133  * @dev: device
5134  * @upper_dev: new upper device
5135  *
5136  * Adds a link to device which is upper to this one. The caller must hold
5137  * the RTNL lock. On a failure a negative errno code is returned.
5138  * On success the reference counts are adjusted and the function
5139  * returns zero.
5140  */
5141 int netdev_upper_dev_link(struct net_device *dev,
5142                           struct net_device *upper_dev)
5143 {
5144         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5145 }
5146 EXPORT_SYMBOL(netdev_upper_dev_link);
5147
5148 /**
5149  * netdev_master_upper_dev_link - Add a master link to the upper device
5150  * @dev: device
5151  * @upper_dev: new upper device
5152  *
5153  * Adds a link to device which is upper to this one. In this case, only
5154  * one master upper device can be linked, although other non-master devices
5155  * might be linked as well. The caller must hold the RTNL lock.
5156  * On a failure a negative errno code is returned. On success the reference
5157  * counts are adjusted and the function returns zero.
5158  */
5159 int netdev_master_upper_dev_link(struct net_device *dev,
5160                                  struct net_device *upper_dev)
5161 {
5162         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5163 }
5164 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5165
5166 int netdev_master_upper_dev_link_private(struct net_device *dev,
5167                                          struct net_device *upper_dev,
5168                                          void *private)
5169 {
5170         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5171 }
5172 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5173
5174 /**
5175  * netdev_upper_dev_unlink - Removes a link to upper device
5176  * @dev: device
5177  * @upper_dev: new upper device
5178  *
5179  * Removes a link to device which is upper to this one. The caller must hold
5180  * the RTNL lock.
5181  */
5182 void netdev_upper_dev_unlink(struct net_device *dev,
5183                              struct net_device *upper_dev)
5184 {
5185         struct netdev_adjacent *i, *j;
5186         ASSERT_RTNL();
5187
5188         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5189
5190         /* Here is the tricky part. We must remove all dev's lower
5191          * devices from all upper_dev's upper devices and vice
5192          * versa, to maintain the graph relationship.
5193          */
5194         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5195                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5196                         __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5197
5198         /* remove also the devices itself from lower/upper device
5199          * list
5200          */
5201         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5202                 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5203
5204         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5205                 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5206
5207         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5208 }
5209 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5210
5211 void netdev_adjacent_add_links(struct net_device *dev)
5212 {
5213         struct netdev_adjacent *iter;
5214
5215         struct net *net = dev_net(dev);
5216
5217         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5218                 if (!net_eq(net,dev_net(iter->dev)))
5219                         continue;
5220                 netdev_adjacent_sysfs_add(iter->dev, dev,
5221                                           &iter->dev->adj_list.lower);
5222                 netdev_adjacent_sysfs_add(dev, iter->dev,
5223                                           &dev->adj_list.upper);
5224         }
5225
5226         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5227                 if (!net_eq(net,dev_net(iter->dev)))
5228                         continue;
5229                 netdev_adjacent_sysfs_add(iter->dev, dev,
5230                                           &iter->dev->adj_list.upper);
5231                 netdev_adjacent_sysfs_add(dev, iter->dev,
5232                                           &dev->adj_list.lower);
5233         }
5234 }
5235
5236 void netdev_adjacent_del_links(struct net_device *dev)
5237 {
5238         struct netdev_adjacent *iter;
5239
5240         struct net *net = dev_net(dev);
5241
5242         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5243                 if (!net_eq(net,dev_net(iter->dev)))
5244                         continue;
5245                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5246                                           &iter->dev->adj_list.lower);
5247                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5248                                           &dev->adj_list.upper);
5249         }
5250
5251         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5252                 if (!net_eq(net,dev_net(iter->dev)))
5253                         continue;
5254                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5255                                           &iter->dev->adj_list.upper);
5256                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5257                                           &dev->adj_list.lower);
5258         }
5259 }
5260
5261 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5262 {
5263         struct netdev_adjacent *iter;
5264
5265         struct net *net = dev_net(dev);
5266
5267         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5268                 if (!net_eq(net,dev_net(iter->dev)))
5269                         continue;
5270                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5271                                           &iter->dev->adj_list.lower);
5272                 netdev_adjacent_sysfs_add(iter->dev, dev,
5273                                           &iter->dev->adj_list.lower);
5274         }
5275
5276         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5277                 if (!net_eq(net,dev_net(iter->dev)))
5278                         continue;
5279                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5280                                           &iter->dev->adj_list.upper);
5281                 netdev_adjacent_sysfs_add(iter->dev, dev,
5282                                           &iter->dev->adj_list.upper);
5283         }
5284 }
5285
5286 void *netdev_lower_dev_get_private(struct net_device *dev,
5287                                    struct net_device *lower_dev)
5288 {
5289         struct netdev_adjacent *lower;
5290
5291         if (!lower_dev)
5292                 return NULL;
5293         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5294         if (!lower)
5295                 return NULL;
5296
5297         return lower->private;
5298 }
5299 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5300
5301
5302 int dev_get_nest_level(struct net_device *dev,
5303                        bool (*type_check)(struct net_device *dev))
5304 {
5305         struct net_device *lower = NULL;
5306         struct list_head *iter;
5307         int max_nest = -1;
5308         int nest;
5309
5310         ASSERT_RTNL();
5311
5312         netdev_for_each_lower_dev(dev, lower, iter) {
5313                 nest = dev_get_nest_level(lower, type_check);
5314                 if (max_nest < nest)
5315                         max_nest = nest;
5316         }
5317
5318         if (type_check(dev))
5319                 max_nest++;
5320
5321         return max_nest;
5322 }
5323 EXPORT_SYMBOL(dev_get_nest_level);
5324
5325 static void dev_change_rx_flags(struct net_device *dev, int flags)
5326 {
5327         const struct net_device_ops *ops = dev->netdev_ops;
5328
5329         if (ops->ndo_change_rx_flags)
5330                 ops->ndo_change_rx_flags(dev, flags);
5331 }
5332
5333 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5334 {
5335         unsigned int old_flags = dev->flags;
5336         kuid_t uid;
5337         kgid_t gid;
5338
5339         ASSERT_RTNL();
5340
5341         dev->flags |= IFF_PROMISC;
5342         dev->promiscuity += inc;
5343         if (dev->promiscuity == 0) {
5344                 /*
5345                  * Avoid overflow.
5346                  * If inc causes overflow, untouch promisc and return error.
5347                  */
5348                 if (inc < 0)
5349                         dev->flags &= ~IFF_PROMISC;
5350                 else {
5351                         dev->promiscuity -= inc;
5352                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5353                                 dev->name);
5354                         return -EOVERFLOW;
5355                 }
5356         }
5357         if (dev->flags != old_flags) {
5358                 pr_info("device %s %s promiscuous mode\n",
5359                         dev->name,
5360                         dev->flags & IFF_PROMISC ? "entered" : "left");
5361                 if (audit_enabled) {
5362                         current_uid_gid(&uid, &gid);
5363                         audit_log(current->audit_context, GFP_ATOMIC,
5364                                 AUDIT_ANOM_PROMISCUOUS,
5365                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5366                                 dev->name, (dev->flags & IFF_PROMISC),
5367                                 (old_flags & IFF_PROMISC),
5368                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5369                                 from_kuid(&init_user_ns, uid),
5370                                 from_kgid(&init_user_ns, gid),
5371                                 audit_get_sessionid(current));
5372                 }
5373
5374                 dev_change_rx_flags(dev, IFF_PROMISC);
5375         }
5376         if (notify)
5377                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5378         return 0;
5379 }
5380
5381 /**
5382  *      dev_set_promiscuity     - update promiscuity count on a device
5383  *      @dev: device
5384  *      @inc: modifier
5385  *
5386  *      Add or remove promiscuity from a device. While the count in the device
5387  *      remains above zero the interface remains promiscuous. Once it hits zero
5388  *      the device reverts back to normal filtering operation. A negative inc
5389  *      value is used to drop promiscuity on the device.
5390  *      Return 0 if successful or a negative errno code on error.
5391  */
5392 int dev_set_promiscuity(struct net_device *dev, int inc)
5393 {
5394         unsigned int old_flags = dev->flags;
5395         int err;
5396
5397         err = __dev_set_promiscuity(dev, inc, true);
5398         if (err < 0)
5399                 return err;
5400         if (dev->flags != old_flags)
5401                 dev_set_rx_mode(dev);
5402         return err;
5403 }
5404 EXPORT_SYMBOL(dev_set_promiscuity);
5405
5406 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5407 {
5408         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5409
5410         ASSERT_RTNL();
5411
5412         dev->flags |= IFF_ALLMULTI;
5413         dev->allmulti += inc;
5414         if (dev->allmulti == 0) {
5415                 /*
5416                  * Avoid overflow.
5417                  * If inc causes overflow, untouch allmulti and return error.
5418                  */
5419                 if (inc < 0)
5420                         dev->flags &= ~IFF_ALLMULTI;
5421                 else {
5422                         dev->allmulti -= inc;
5423                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5424                                 dev->name);
5425                         return -EOVERFLOW;
5426                 }
5427         }
5428         if (dev->flags ^ old_flags) {
5429                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5430                 dev_set_rx_mode(dev);
5431                 if (notify)
5432                         __dev_notify_flags(dev, old_flags,
5433                                            dev->gflags ^ old_gflags);
5434         }
5435         return 0;
5436 }
5437
5438 /**
5439  *      dev_set_allmulti        - update allmulti count on a device
5440  *      @dev: device
5441  *      @inc: modifier
5442  *
5443  *      Add or remove reception of all multicast frames to a device. While the
5444  *      count in the device remains above zero the interface remains listening
5445  *      to all interfaces. Once it hits zero the device reverts back to normal
5446  *      filtering operation. A negative @inc value is used to drop the counter
5447  *      when releasing a resource needing all multicasts.
5448  *      Return 0 if successful or a negative errno code on error.
5449  */
5450
5451 int dev_set_allmulti(struct net_device *dev, int inc)
5452 {
5453         return __dev_set_allmulti(dev, inc, true);
5454 }
5455 EXPORT_SYMBOL(dev_set_allmulti);
5456
5457 /*
5458  *      Upload unicast and multicast address lists to device and
5459  *      configure RX filtering. When the device doesn't support unicast
5460  *      filtering it is put in promiscuous mode while unicast addresses
5461  *      are present.
5462  */
5463 void __dev_set_rx_mode(struct net_device *dev)
5464 {
5465         const struct net_device_ops *ops = dev->netdev_ops;
5466
5467         /* dev_open will call this function so the list will stay sane. */
5468         if (!(dev->flags&IFF_UP))
5469                 return;
5470
5471         if (!netif_device_present(dev))
5472                 return;
5473
5474         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5475                 /* Unicast addresses changes may only happen under the rtnl,
5476                  * therefore calling __dev_set_promiscuity here is safe.
5477                  */
5478                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5479                         __dev_set_promiscuity(dev, 1, false);
5480                         dev->uc_promisc = true;
5481                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5482                         __dev_set_promiscuity(dev, -1, false);
5483                         dev->uc_promisc = false;
5484                 }
5485         }
5486
5487         if (ops->ndo_set_rx_mode)
5488                 ops->ndo_set_rx_mode(dev);
5489 }
5490
/*
 *	Locked variant of __dev_set_rx_mode(): the address-list lock of
 *	@dev is taken (BH variant) around the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
5497
5498 /**
5499  *      dev_get_flags - get flags reported to userspace
5500  *      @dev: device
5501  *
5502  *      Get the combination of flag bits exported through APIs to userspace.
5503  */
5504 unsigned int dev_get_flags(const struct net_device *dev)
5505 {
5506         unsigned int flags;
5507
5508         flags = (dev->flags & ~(IFF_PROMISC |
5509                                 IFF_ALLMULTI |
5510                                 IFF_RUNNING |
5511                                 IFF_LOWER_UP |
5512                                 IFF_DORMANT)) |
5513                 (dev->gflags & (IFF_PROMISC |
5514                                 IFF_ALLMULTI));
5515
5516         if (netif_running(dev)) {
5517                 if (netif_oper_up(dev))
5518                         flags |= IFF_RUNNING;
5519                 if (netif_carrier_ok(dev))
5520                         flags |= IFF_LOWER_UP;
5521                 if (netif_dormant(dev))
5522                         flags |= IFF_DORMANT;
5523         }
5524
5525         return flags;
5526 }
5527 EXPORT_SYMBOL(dev_get_flags);
5528
5529 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5530 {
5531         unsigned int old_flags = dev->flags;
5532         int ret;
5533
5534         ASSERT_RTNL();
5535
5536         /*
5537          *      Set the flags on our device.
5538          */
5539
5540         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5541                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5542                                IFF_AUTOMEDIA)) |
5543                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5544                                     IFF_ALLMULTI));
5545
5546         /*
5547          *      Load in the correct multicast list now the flags have changed.
5548          */
5549
5550         if ((old_flags ^ flags) & IFF_MULTICAST)
5551                 dev_change_rx_flags(dev, IFF_MULTICAST);
5552
5553         dev_set_rx_mode(dev);
5554
5555         /*
5556          *      Have we downed the interface. We handle IFF_UP ourselves
5557          *      according to user attempts to set it, rather than blindly
5558          *      setting it.
5559          */
5560
5561         ret = 0;
5562         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5563                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5564
5565                 if (!ret)
5566                         dev_set_rx_mode(dev);
5567         }
5568
5569         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5570                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5571                 unsigned int old_flags = dev->flags;
5572
5573                 dev->gflags ^= IFF_PROMISC;
5574
5575                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5576                         if (dev->flags != old_flags)
5577                                 dev_set_rx_mode(dev);
5578         }
5579
5580         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5581            is important. Some (broken) drivers set IFF_PROMISC, when
5582            IFF_ALLMULTI is requested not asking us and not reporting.
5583          */
5584         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5585                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5586
5587                 dev->gflags ^= IFF_ALLMULTI;
5588                 __dev_set_allmulti(dev, inc, false);
5589         }
5590
5591         return ret;
5592 }
5593
5594 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5595                         unsigned int gchanges)
5596 {
5597         unsigned int changes = dev->flags ^ old_flags;
5598
5599         if (gchanges)
5600                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5601
5602         if (changes & IFF_UP) {
5603                 if (dev->flags & IFF_UP)
5604                         call_netdevice_notifiers(NETDEV_UP, dev);
5605                 else
5606                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5607         }
5608
5609         if (dev->flags & IFF_UP &&
5610             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5611                 struct netdev_notifier_change_info change_info;
5612
5613                 change_info.flags_changed = changes;
5614                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5615                                               &change_info.info);
5616         }
5617 }
5618
5619 /**
5620  *      dev_change_flags - change device settings
5621  *      @dev: device
5622  *      @flags: device state flags
5623  *
5624  *      Change settings on device based state flags. The flags are
5625  *      in the userspace exported format.
5626  */
5627 int dev_change_flags(struct net_device *dev, unsigned int flags)
5628 {
5629         int ret;
5630         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5631
5632         ret = __dev_change_flags(dev, flags);
5633         if (ret < 0)
5634                 return ret;
5635
5636         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5637         __dev_notify_flags(dev, old_flags, changes);
5638         return ret;
5639 }
5640 EXPORT_SYMBOL(dev_change_flags);
5641
5642 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5643 {
5644         const struct net_device_ops *ops = dev->netdev_ops;
5645
5646         if (ops->ndo_change_mtu)
5647                 return ops->ndo_change_mtu(dev, new_mtu);
5648
5649         dev->mtu = new_mtu;
5650         return 0;
5651 }
5652
5653 /**
5654  *      dev_set_mtu - Change maximum transfer unit
5655  *      @dev: device
5656  *      @new_mtu: new transfer unit
5657  *
5658  *      Change the maximum transfer size of the network device.
5659  */
5660 int dev_set_mtu(struct net_device *dev, int new_mtu)
5661 {
5662         int err, orig_mtu;
5663
5664         if (new_mtu == dev->mtu)
5665                 return 0;
5666
5667         /*      MTU must be positive.    */
5668         if (new_mtu < 0)
5669                 return -EINVAL;
5670
5671         if (!netif_device_present(dev))
5672                 return -ENODEV;
5673
5674         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5675         err = notifier_to_errno(err);
5676         if (err)
5677                 return err;
5678
5679         orig_mtu = dev->mtu;
5680         err = __dev_set_mtu(dev, new_mtu);
5681
5682         if (!err) {
5683                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5684                 err = notifier_to_errno(err);
5685                 if (err) {
5686                         /* setting mtu back and notifying everyone again,
5687                          * so that they have a chance to revert changes.
5688                          */
5689                         __dev_set_mtu(dev, orig_mtu);
5690                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5691                 }
5692         }
5693         return err;
5694 }
5695 EXPORT_SYMBOL(dev_set_mtu);
5696
5697 /**
5698  *      dev_set_group - Change group this device belongs to
5699  *      @dev: device
5700  *      @new_group: group this device should belong to
5701  */
5702 void dev_set_group(struct net_device *dev, int new_group)
5703 {
5704         dev->group = new_group;
5705 }
5706 EXPORT_SYMBOL(dev_set_group);
5707
5708 /**
5709  *      dev_set_mac_address - Change Media Access Control Address
5710  *      @dev: device
5711  *      @sa: new address
5712  *
5713  *      Change the hardware (MAC) address of the device
5714  */
5715 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5716 {
5717         const struct net_device_ops *ops = dev->netdev_ops;
5718         int err;
5719
5720         if (!ops->ndo_set_mac_address)
5721                 return -EOPNOTSUPP;
5722         if (sa->sa_family != dev->type)
5723                 return -EINVAL;
5724         if (!netif_device_present(dev))
5725                 return -ENODEV;
5726         err = ops->ndo_set_mac_address(dev, sa);
5727         if (err)
5728                 return err;
5729         dev->addr_assign_type = NET_ADDR_SET;
5730         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5731         add_device_randomness(dev->dev_addr, dev->addr_len);
5732         return 0;
5733 }
5734 EXPORT_SYMBOL(dev_set_mac_address);
5735
5736 /**
5737  *      dev_change_carrier - Change device carrier
5738  *      @dev: device
5739  *      @new_carrier: new value
5740  *
5741  *      Change device carrier
5742  */
5743 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5744 {
5745         const struct net_device_ops *ops = dev->netdev_ops;
5746
5747         if (!ops->ndo_change_carrier)
5748                 return -EOPNOTSUPP;
5749         if (!netif_device_present(dev))
5750                 return -ENODEV;
5751         return ops->ndo_change_carrier(dev, new_carrier);
5752 }
5753 EXPORT_SYMBOL(dev_change_carrier);
5754
5755 /**
5756  *      dev_get_phys_port_id - Get device physical port ID
5757  *      @dev: device
5758  *      @ppid: port ID
5759  *
5760  *      Get device physical port ID
5761  */
5762 int dev_get_phys_port_id(struct net_device *dev,
5763                          struct netdev_phys_port_id *ppid)
5764 {
5765         const struct net_device_ops *ops = dev->netdev_ops;
5766
5767         if (!ops->ndo_get_phys_port_id)
5768                 return -EOPNOTSUPP;
5769         return ops->ndo_get_phys_port_id(dev, ppid);
5770 }
5771 EXPORT_SYMBOL(dev_get_phys_port_id);
5772
5773 /**
5774  *      dev_new_index   -       allocate an ifindex
5775  *      @net: the applicable net namespace
5776  *
5777  *      Returns a suitable unique value for a new device interface
5778  *      number.  The caller must hold the rtnl semaphore or the
5779  *      dev_base_lock to be sure it remains unique.
5780  */
5781 static int dev_new_index(struct net *net)
5782 {
5783         int ifindex = net->ifindex;
5784         for (;;) {
5785                 if (++ifindex <= 0)
5786                         ifindex = 1;
5787                 if (!__dev_get_by_index(net, ifindex))
5788                         return net->ifindex = ifindex;
5789         }
5790 }
5791
/* Delayed registration/unregistration */
5793 static LIST_HEAD(net_todo_list);
5794 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5795
5796 static void net_set_todo(struct net_device *dev)
5797 {
5798         list_add_tail(&dev->todo_list, &net_todo_list);
5799         dev_net(dev)->dev_unreg_count++;
5800 }
5801
5802 static void rollback_registered_many(struct list_head *head)
5803 {
5804         struct net_device *dev, *tmp;
5805         LIST_HEAD(close_head);
5806
5807         BUG_ON(dev_boot_phase);
5808         ASSERT_RTNL();
5809
5810         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5811                 /* Some devices call without registering
5812                  * for initialization unwind. Remove those
5813                  * devices and proceed with the remaining.
5814                  */
5815                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5816                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5817                                  dev->name, dev);
5818
5819                         WARN_ON(1);
5820                         list_del(&dev->unreg_list);
5821                         continue;
5822                 }
5823                 dev->dismantle = true;
5824                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5825         }
5826
5827         /* If device is running, close it first. */
5828         list_for_each_entry(dev, head, unreg_list)
5829                 list_add_tail(&dev->close_list, &close_head);
5830         dev_close_many(&close_head);
5831
5832         list_for_each_entry(dev, head, unreg_list) {
5833                 /* And unlink it from device chain. */
5834                 unlist_netdevice(dev);
5835
5836                 dev->reg_state = NETREG_UNREGISTERING;
5837                 on_each_cpu(flush_backlog, dev, 1);
5838         }
5839
5840         synchronize_net();
5841
5842         list_for_each_entry(dev, head, unreg_list) {
5843                 /* Shutdown queueing discipline. */
5844                 dev_shutdown(dev);
5845
5846
5847                 /* Notify protocols, that we are about to destroy
5848                    this device. They should clean all the things.
5849                 */
5850                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5851
5852                 /*
5853                  *      Flush the unicast and multicast chains
5854                  */
5855                 dev_uc_flush(dev);
5856                 dev_mc_flush(dev);
5857
5858                 if (dev->netdev_ops->ndo_uninit)
5859                         dev->netdev_ops->ndo_uninit(dev);
5860
5861                 if (!dev->rtnl_link_ops ||
5862                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5863                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5864
5865                 /* Notifier chain MUST detach us all upper devices. */
5866                 WARN_ON(netdev_has_any_upper_dev(dev));
5867
5868                 /* Remove entries from kobject tree */
5869                 netdev_unregister_kobject(dev);
5870 #ifdef CONFIG_XPS
5871                 /* Remove XPS queueing entries */
5872                 netif_reset_xps_queues_gt(dev, 0);
5873 #endif
5874         }
5875
5876         synchronize_net();
5877
5878         list_for_each_entry(dev, head, unreg_list)
5879                 dev_put(dev);
5880 }
5881
5882 static void rollback_registered(struct net_device *dev)
5883 {
5884         LIST_HEAD(single);
5885
5886         list_add(&dev->unreg_list, &single);
5887         rollback_registered_many(&single);
5888         list_del(&single);
5889 }
5890
5891 static netdev_features_t netdev_fix_features(struct net_device *dev,
5892         netdev_features_t features)
5893 {
5894         /* Fix illegal checksum combinations */
5895         if ((features & NETIF_F_HW_CSUM) &&
5896             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5897                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5898                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5899         }
5900
5901         /* TSO requires that SG is present as well. */
5902         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5903                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5904                 features &= ~NETIF_F_ALL_TSO;
5905         }
5906
5907         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5908                                         !(features & NETIF_F_IP_CSUM)) {
5909                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5910                 features &= ~NETIF_F_TSO;
5911                 features &= ~NETIF_F_TSO_ECN;
5912         }
5913
5914         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5915                                          !(features & NETIF_F_IPV6_CSUM)) {
5916                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5917                 features &= ~NETIF_F_TSO6;
5918         }
5919
5920         /* TSO ECN requires that TSO is present as well. */
5921         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5922                 features &= ~NETIF_F_TSO_ECN;
5923
5924         /* Software GSO depends on SG. */
5925         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5926                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5927                 features &= ~NETIF_F_GSO;
5928         }
5929
5930         /* UFO needs SG and checksumming */
5931         if (features & NETIF_F_UFO) {
5932                 /* maybe split UFO into V4 and V6? */
5933                 if (!((features & NETIF_F_GEN_CSUM) ||
5934                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5935                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5936                         netdev_dbg(dev,
5937                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5938                         features &= ~NETIF_F_UFO;
5939                 }
5940
5941                 if (!(features & NETIF_F_SG)) {
5942                         netdev_dbg(dev,
5943                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5944                         features &= ~NETIF_F_UFO;
5945                 }
5946         }
5947
5948 #ifdef CONFIG_NET_RX_BUSY_POLL
5949         if (dev->netdev_ops->ndo_busy_poll)
5950                 features |= NETIF_F_BUSY_POLL;
5951         else
5952 #endif
5953                 features &= ~NETIF_F_BUSY_POLL;
5954
5955         return features;
5956 }
5957
5958 int __netdev_update_features(struct net_device *dev)
5959 {
5960         netdev_features_t features;
5961         int err = 0;
5962
5963         ASSERT_RTNL();
5964
5965         features = netdev_get_wanted_features(dev);
5966
5967         if (dev->netdev_ops->ndo_fix_features)
5968                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5969
5970         /* driver might be less strict about feature dependencies */
5971         features = netdev_fix_features(dev, features);
5972
5973         if (dev->features == features)
5974                 return 0;
5975
5976         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5977                 &dev->features, &features);
5978
5979         if (dev->netdev_ops->ndo_set_features)
5980                 err = dev->netdev_ops->ndo_set_features(dev, features);
5981
5982         if (unlikely(err < 0)) {
5983                 netdev_err(dev,
5984                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5985                         err, &features, &dev->features);
5986                 return -1;
5987         }
5988
5989         if (!err)
5990                 dev->features = features;
5991
5992         return 1;
5993 }
5994
/**
 *      netdev_update_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate dev->features through __netdev_update_features() and
 *      call netdev_features_change() whenever that helper returns a
 *      non-zero value. Intended to be called after driver or hardware
 *      dependent conditions that influence the features may have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        if (!__netdev_update_features(dev))
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6009
/**
 *      netdev_change_features - recalculate device features
 *      @dev: the device to check
 *
 *      Re-evaluate dev->features and call netdev_features_change()
 *      unconditionally, i.e. even when the feature set did not change.
 *      Use this instead of netdev_update_features() when
 *      dev->vlan_features might have changed as well, so the change can
 *      be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The result is deliberately ignored: we always notify. */
        (void)__netdev_update_features(dev);

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6026
6027 /**
6028  *      netif_stacked_transfer_operstate -      transfer operstate
6029  *      @rootdev: the root or lower level device to transfer state from
6030  *      @dev: the device to transfer operstate to
6031  *
6032  *      Transfer operational state from root to device. This is normally
6033  *      called when a stacking relationship exists between the root
6034  *      device and the device(a leaf device).
6035  */
6036 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6037                                         struct net_device *dev)
6038 {
6039         if (rootdev->operstate == IF_OPER_DORMANT)
6040                 netif_dormant_on(dev);
6041         else
6042                 netif_dormant_off(dev);
6043
6044         if (netif_carrier_ok(rootdev)) {
6045                 if (!netif_carrier_ok(dev))
6046                         netif_carrier_on(dev);
6047         } else {
6048                 if (netif_carrier_ok(dev))
6049                         netif_carrier_off(dev);
6050         }
6051 }
6052 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6053
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * point every entry back at @dev and publish the array in dev->_rx.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nr_queues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nr_queues < 1);

        queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (idx = 0; idx < nr_queues; idx++)
                queues[idx].dev = dev;

        return 0;
}
#endif
6073
6074 static void netdev_init_one_queue(struct net_device *dev,
6075                                   struct netdev_queue *queue, void *_unused)
6076 {
6077         /* Initialize queue lock */
6078         spin_lock_init(&queue->_xmit_lock);
6079         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6080         queue->xmit_lock_owner = -1;
6081         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6082         queue->dev = dev;
6083 #ifdef CONFIG_BQL
6084         dql_init(&queue->dql, HZ);
6085 #endif
6086 }
6087
6088 static void netif_free_tx_queues(struct net_device *dev)
6089 {
6090         kvfree(dev->_tx);
6091 }
6092
6093 static int netif_alloc_netdev_queues(struct net_device *dev)
6094 {
6095         unsigned int count = dev->num_tx_queues;
6096         struct netdev_queue *tx;
6097         size_t sz = count * sizeof(*tx);
6098
6099         if (count < 1 || count > 0xffff)
6100                 return -EINVAL;
6101
6102         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6103         if (!tx) {
6104                 tx = vzalloc(sz);
6105                 if (!tx)
6106                         return -ENOMEM;
6107         }
6108         dev->_tx = tx;
6109
6110         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6111         spin_lock_init(&dev->tx_global_lock);
6112
6113         return 0;
6114 }
6115
6116 /**
6117  *      register_netdevice      - register a network device
6118  *      @dev: device to register
6119  *
6120  *      Take a completed network device structure and add it to the kernel
6121  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6122  *      chain. 0 is returned on success. A negative errno code is returned
6123  *      on a failure to set up the device, or if the name is a duplicate.
6124  *
6125  *      Callers must hold the rtnl semaphore. You may want
6126  *      register_netdev() instead of this.
6127  *
6128  *      BUGS:
6129  *      The locking appears insufficient to guarantee two parallel registers
6130  *      will not get the same name.
6131  */
6132
6133 int register_netdevice(struct net_device *dev)
6134 {
6135         int ret;
6136         struct net *net = dev_net(dev);
6137
6138         BUG_ON(dev_boot_phase);
6139         ASSERT_RTNL();
6140
6141         might_sleep();
6142
6143         /* When net_device's are persistent, this will be fatal. */
6144         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6145         BUG_ON(!net);
6146
6147         spin_lock_init(&dev->addr_list_lock);
6148         netdev_set_addr_lockdep_class(dev);
6149
6150         dev->iflink = -1;
6151
6152         ret = dev_get_valid_name(net, dev, dev->name);
6153         if (ret < 0)
6154                 goto out;
6155
6156         /* Init, if this function is available */
6157         if (dev->netdev_ops->ndo_init) {
6158                 ret = dev->netdev_ops->ndo_init(dev);
6159                 if (ret) {
6160                         if (ret > 0)
6161                                 ret = -EIO;
6162                         goto out;
6163                 }
6164         }
6165
6166         if (((dev->hw_features | dev->features) &
6167              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6168             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6169              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6170                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6171                 ret = -EINVAL;
6172                 goto err_uninit;
6173         }
6174
6175         ret = -EBUSY;
6176         if (!dev->ifindex)
6177                 dev->ifindex = dev_new_index(net);
6178         else if (__dev_get_by_index(net, dev->ifindex))
6179                 goto err_uninit;
6180
6181         if (dev->iflink == -1)
6182                 dev->iflink = dev->ifindex;
6183
6184         /* Transfer changeable features to wanted_features and enable
6185          * software offloads (GSO and GRO).
6186          */
6187         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6188         dev->features |= NETIF_F_SOFT_FEATURES;
6189         dev->wanted_features = dev->features & dev->hw_features;
6190
6191         if (!(dev->flags & IFF_LOOPBACK)) {
6192                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6193         }
6194
6195         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6196          */
6197         dev->vlan_features |= NETIF_F_HIGHDMA;
6198
6199         /* Make NETIF_F_SG inheritable to tunnel devices.
6200          */
6201         dev->hw_enc_features |= NETIF_F_SG;
6202
6203         /* Make NETIF_F_SG inheritable to MPLS.
6204          */
6205         dev->mpls_features |= NETIF_F_SG;
6206
6207         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6208         ret = notifier_to_errno(ret);
6209         if (ret)
6210                 goto err_uninit;
6211
6212         ret = netdev_register_kobject(dev);
6213         if (ret)
6214                 goto err_uninit;
6215         dev->reg_state = NETREG_REGISTERED;
6216
6217         __netdev_update_features(dev);
6218
6219         /*
6220          *      Default initial state at registry is that the
6221          *      device is present.
6222          */
6223
6224         set_bit(__LINK_STATE_PRESENT, &dev->state);
6225
6226         linkwatch_init_dev(dev);
6227
6228         dev_init_scheduler(dev);
6229         dev_hold(dev);
6230         list_netdevice(dev);
6231         add_device_randomness(dev->dev_addr, dev->addr_len);
6232
6233         /* If the device has permanent device address, driver should
6234          * set dev_addr and also addr_assign_type should be set to
6235          * NET_ADDR_PERM (default value).
6236          */
6237         if (dev->addr_assign_type == NET_ADDR_PERM)
6238                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6239
6240         /* Notify protocols, that a new device appeared. */
6241         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6242         ret = notifier_to_errno(ret);
6243         if (ret) {
6244                 rollback_registered(dev);
6245                 dev->reg_state = NETREG_UNREGISTERED;
6246         }
6247         /*
6248          *      Prevent userspace races by waiting until the network
6249          *      device is fully setup before sending notifications.
6250          */
6251         if (!dev->rtnl_link_ops ||
6252             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6253                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6254
6255 out:
6256         return ret;
6257
6258 err_uninit:
6259         if (dev->netdev_ops->ndo_uninit)
6260                 dev->netdev_ops->ndo_uninit(dev);
6261         goto out;
6262 }
6263 EXPORT_SYMBOL(register_netdevice);
6264
6265 /**
6266  *      init_dummy_netdev       - init a dummy network device for NAPI
6267  *      @dev: device to init
6268  *
6269  *      This takes a network device structure and initialize the minimum
6270  *      amount of fields so it can be used to schedule NAPI polls without
6271  *      registering a full blown interface. This is to be used by drivers
6272  *      that need to tie several hardware interfaces to a single NAPI
6273  *      poll scheduler due to HW limitations.
6274  */
6275 int init_dummy_netdev(struct net_device *dev)
6276 {
6277         /* Clear everything. Note we don't initialize spinlocks
6278          * are they aren't supposed to be taken by any of the
6279          * NAPI code and this dummy netdev is supposed to be
6280          * only ever used for NAPI polls
6281          */
6282         memset(dev, 0, sizeof(struct net_device));
6283
6284         /* make sure we BUG if trying to hit standard
6285          * register/unregister code path
6286          */
6287         dev->reg_state = NETREG_DUMMY;
6288
6289         /* NAPI wants this */
6290         INIT_LIST_HEAD(&dev->napi_list);
6291
6292         /* a dummy interface is started by default */
6293         set_bit(__LINK_STATE_PRESENT, &dev->state);
6294         set_bit(__LINK_STATE_START, &dev->state);
6295
6296         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6297          * because users of this 'device' dont need to change
6298          * its refcount.
6299          */
6300
6301         return 0;
6302 }
6303 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6304
6305
/**
 *      register_netdev - register a network device
 *      @dev: device to register
 *
 *      Take a completed network device structure and add it to the kernel
 *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *      chain. 0 is returned on success. A negative errno code is returned
 *      on a failure to set up the device, or if the name is a duplicate.
 *
 *      This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *      the device name is expanded if a format string was passed to
 *      alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
        int ret;

        rtnl_lock();
        ret = register_netdevice(dev);
        rtnl_unlock();

        return ret;
}
EXPORT_SYMBOL(register_netdev);
6329
6330 int netdev_refcnt_read(const struct net_device *dev)
6331 {
6332         int i, refcnt = 0;
6333
6334         for_each_possible_cpu(i)
6335                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6336         return refcnt;
6337 }
6338 EXPORT_SYMBOL(netdev_refcnt_read);
6339
6340 /**
6341  * netdev_wait_allrefs - wait until all references are gone.
6342  * @dev: target net_device
6343  *
6344  * This is called when unregistering network devices.
6345  *
6346  * Any protocol or device that holds a reference should register
6347  * for netdevice notification, and cleanup and put back the
6348  * reference if they receive an UNREGISTER event.
6349  * We can get stuck here if buggy protocols don't correctly
6350  * call dev_put.
6351  */
6352 static void netdev_wait_allrefs(struct net_device *dev)
6353 {
6354         unsigned long rebroadcast_time, warning_time;
6355         int refcnt;
6356
6357         linkwatch_forget_dev(dev);
6358
6359         rebroadcast_time = warning_time = jiffies;
6360         refcnt = netdev_refcnt_read(dev);
6361
6362         while (refcnt != 0) {
6363                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6364                         rtnl_lock();
6365
6366                         /* Rebroadcast unregister notification */
6367                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6368
6369                         __rtnl_unlock();
6370                         rcu_barrier();
6371                         rtnl_lock();
6372
6373                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6374                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6375                                      &dev->state)) {
6376                                 /* We must not have linkwatch events
6377                                  * pending on unregister. If this
6378                                  * happens, we simply run the queue
6379                                  * unscheduled, resulting in a noop
6380                                  * for this device.
6381                                  */
6382                                 linkwatch_run_queue();
6383                         }
6384
6385                         __rtnl_unlock();
6386
6387                         rebroadcast_time = jiffies;
6388                 }
6389
6390                 msleep(250);
6391
6392                 refcnt = netdev_refcnt_read(dev);
6393
6394                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6395                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6396                                  dev->name, refcnt);
6397                         warning_time = jiffies;
6398                 }
6399         }
6400 }
6401
6402 /* The sequence is:
6403  *
6404  *      rtnl_lock();
6405  *      ...
6406  *      register_netdevice(x1);
6407  *      register_netdevice(x2);
6408  *      ...
6409  *      unregister_netdevice(y1);
6410  *      unregister_netdevice(y2);
6411  *      ...
6412  *      rtnl_unlock();
6413  *      free_netdev(y1);
6414  *      free_netdev(y2);
6415  *
6416  * We are invoked by rtnl_unlock().
6417  * This allows us to deal with problems:
6418  * 1) We can delete sysfs objects which invoke hotplug
6419  *    without deadlocking with linkwatch via keventd.
6420  * 2) Since we run with the RTNL semaphore not held, we can sleep
6421  *    safely in order to wait for the netdev refcnt to drop to zero.
6422  *
6423  * We must not return until all unregister events added during
6424  * the interval the lock was held have been completed.
6425  */
6426 void netdev_run_todo(void)
6427 {
6428         struct list_head list;
6429
6430         /* Snapshot list, allow later requests */
6431         list_replace_init(&net_todo_list, &list);
6432
6433         __rtnl_unlock();
6434
6435
6436         /* Wait for rcu callbacks to finish before next phase */
6437         if (!list_empty(&list))
6438                 rcu_barrier();
6439
6440         while (!list_empty(&list)) {
6441                 struct net_device *dev
6442                         = list_first_entry(&list, struct net_device, todo_list);
6443                 list_del(&dev->todo_list);
6444
6445                 rtnl_lock();
6446                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6447                 __rtnl_unlock();
6448
6449                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6450                         pr_err("network todo '%s' but state %d\n",
6451                                dev->name, dev->reg_state);
6452                         dump_stack();
6453                         continue;
6454                 }
6455
6456                 dev->reg_state = NETREG_UNREGISTERED;
6457
6458                 netdev_wait_allrefs(dev);
6459
6460                 /* paranoia */
6461                 BUG_ON(netdev_refcnt_read(dev));
6462                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6463                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6464                 WARN_ON(dev->dn_ptr);
6465
6466                 if (dev->destructor)
6467                         dev->destructor(dev);
6468
6469                 /* Report a network device has been unregistered */
6470                 rtnl_lock();
6471                 dev_net(dev)->dev_unreg_count--;
6472                 __rtnl_unlock();
6473                 wake_up(&netdev_unregistering_wq);
6474
6475                 /* Free network device */
6476                 kobject_put(&dev->dev.kobj);
6477         }
6478 }
6479
6480 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6481  * fields in the same order, with only the type differing.
6482  */
6483 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6484                              const struct net_device_stats *netdev_stats)
6485 {
6486 #if BITS_PER_LONG == 64
6487         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6488         memcpy(stats64, netdev_stats, sizeof(*stats64));
6489 #else
6490         size_t i, n = sizeof(*stats64) / sizeof(u64);
6491         const unsigned long *src = (const unsigned long *)netdev_stats;
6492         u64 *dst = (u64 *)stats64;
6493
6494         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6495                      sizeof(*stats64) / sizeof(u64));
6496         for (i = 0; i < n; i++)
6497                 dst[i] = src[i];
6498 #endif
6499 }
6500 EXPORT_SYMBOL(netdev_stats_to_stats64);
6501
6502 /**
6503  *      dev_get_stats   - get network device statistics
6504  *      @dev: device to get statistics from
6505  *      @storage: place to store stats
6506  *
6507  *      Get network statistics from device. Return @storage.
6508  *      The device driver may provide its own method by setting
6509  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6510  *      otherwise the internal statistics structure is used.
6511  */
6512 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6513                                         struct rtnl_link_stats64 *storage)
6514 {
6515         const struct net_device_ops *ops = dev->netdev_ops;
6516
6517         if (ops->ndo_get_stats64) {
6518                 memset(storage, 0, sizeof(*storage));
6519                 ops->ndo_get_stats64(dev, storage);
6520         } else if (ops->ndo_get_stats) {
6521                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6522         } else {
6523                 netdev_stats_to_stats64(storage, &dev->stats);
6524         }
6525         storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
6526         storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
6527         return storage;
6528 }
6529 EXPORT_SYMBOL(dev_get_stats);
6530
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned when that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        ingress->qdisc = &noop_qdisc;
                        ingress->qdisc_sleeping = &noop_qdisc;
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}
6548
6549 static const struct ethtool_ops default_ethtool_ops;
6550
6551 void netdev_set_default_ethtool_ops(struct net_device *dev,
6552                                     const struct ethtool_ops *ops)
6553 {
6554         if (dev->ethtool_ops == &default_ethtool_ops)
6555                 dev->ethtool_ops = ops;
6556 }
6557 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6558
6559 void netdev_freemem(struct net_device *dev)
6560 {
6561         char *addr = (char *)dev - dev->padded;
6562
6563         kvfree(addr);
6564 }
6565
6566 /**
6567  *      alloc_netdev_mqs - allocate network device
6568  *      @sizeof_priv:   size of private data to allocate space for
6569  *      @name:          device name format string
6570  *      @setup:         callback to initialize device
6571  *      @txqs:          the number of TX subqueues to allocate
6572  *      @rxqs:          the number of RX subqueues to allocate
6573  *
6574  *      Allocates a struct net_device with private data area for driver use
6575  *      and performs basic initialization.  Also allocates subqueue structs
6576  *      for each queue on the device.
6577  */
6578 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6579                 void (*setup)(struct net_device *),
6580                 unsigned int txqs, unsigned int rxqs)
6581 {
6582         struct net_device *dev;
6583         size_t alloc_size;
6584         struct net_device *p;
6585
6586         BUG_ON(strlen(name) >= sizeof(dev->name));
6587
6588         if (txqs < 1) {
6589                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6590                 return NULL;
6591         }
6592
6593 #ifdef CONFIG_SYSFS
6594         if (rxqs < 1) {
6595                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6596                 return NULL;
6597         }
6598 #endif
6599
6600         alloc_size = sizeof(struct net_device);
6601         if (sizeof_priv) {
6602                 /* ensure 32-byte alignment of private area */
6603                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6604                 alloc_size += sizeof_priv;
6605         }
6606         /* ensure 32-byte alignment of whole construct */
6607         alloc_size += NETDEV_ALIGN - 1;
6608
6609         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6610         if (!p)
6611                 p = vzalloc(alloc_size);
6612         if (!p)
6613                 return NULL;
6614
6615         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6616         dev->padded = (char *)dev - (char *)p;
6617
6618         dev->pcpu_refcnt = alloc_percpu(int);
6619         if (!dev->pcpu_refcnt)
6620                 goto free_dev;
6621
6622         if (dev_addr_init(dev))
6623                 goto free_pcpu;
6624
6625         dev_mc_init(dev);
6626         dev_uc_init(dev);
6627
6628         dev_net_set(dev, &init_net);
6629
6630         dev->gso_max_size = GSO_MAX_SIZE;
6631         dev->gso_max_segs = GSO_MAX_SEGS;
6632
6633         INIT_LIST_HEAD(&dev->napi_list);
6634         INIT_LIST_HEAD(&dev->unreg_list);
6635         INIT_LIST_HEAD(&dev->close_list);
6636         INIT_LIST_HEAD(&dev->link_watch_list);
6637         INIT_LIST_HEAD(&dev->adj_list.upper);
6638         INIT_LIST_HEAD(&dev->adj_list.lower);
6639         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6640         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6641         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6642         setup(dev);
6643
6644         dev->num_tx_queues = txqs;
6645         dev->real_num_tx_queues = txqs;
6646         if (netif_alloc_netdev_queues(dev))
6647                 goto free_all;
6648
6649 #ifdef CONFIG_SYSFS
6650         dev->num_rx_queues = rxqs;
6651         dev->real_num_rx_queues = rxqs;
6652         if (netif_alloc_rx_queues(dev))
6653                 goto free_all;
6654 #endif
6655
6656         strcpy(dev->name, name);
6657         dev->group = INIT_NETDEV_GROUP;
6658         if (!dev->ethtool_ops)
6659                 dev->ethtool_ops = &default_ethtool_ops;
6660         return dev;
6661
6662 free_all:
6663         free_netdev(dev);
6664         return NULL;
6665
6666 free_pcpu:
6667         free_percpu(dev->pcpu_refcnt);
6668 free_dev:
6669         netdev_freemem(dev);
6670         return NULL;
6671 }
6672 EXPORT_SYMBOL(alloc_netdev_mqs);
6673
6674 /**
6675  *      free_netdev - free network device
6676  *      @dev: device
6677  *
6678  *      This function does the last stage of destroying an allocated device
6679  *      interface. The reference to the device object is released.
6680  *      If this is the last reference then it will be freed.
6681  */
6682 void free_netdev(struct net_device *dev)
6683 {
6684         struct napi_struct *p, *n;
6685
6686         release_net(dev_net(dev));
6687
6688         netif_free_tx_queues(dev);
6689 #ifdef CONFIG_SYSFS
6690         kfree(dev->_rx);
6691 #endif
6692
6693         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6694
6695         /* Flush device addresses */
6696         dev_addr_flush(dev);
6697
6698         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6699                 netif_napi_del(p);
6700
6701         free_percpu(dev->pcpu_refcnt);
6702         dev->pcpu_refcnt = NULL;
6703
6704         /*  Compatibility with error handling in drivers */
6705         if (dev->reg_state == NETREG_UNINITIALIZED) {
6706                 netdev_freemem(dev);
6707                 return;
6708         }
6709
6710         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6711         dev->reg_state = NETREG_RELEASED;
6712
6713         /* will free via device release */
6714         put_device(&dev->dev);
6715 }
6716 EXPORT_SYMBOL(free_netdev);
6717
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      May sleep. The expedited RCU grace period is used when RTNL is
 *      locked, the regular one otherwise.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6733
6734 /**
6735  *      unregister_netdevice_queue - remove device from the kernel
6736  *      @dev: device
6737  *      @head: list
6738  *
6739  *      This function shuts down a device interface and removes it
6740  *      from the kernel tables.
6741  *      If head not NULL, device is queued to be unregistered later.
6742  *
6743  *      Callers must hold the rtnl semaphore.  You may want
6744  *      unregister_netdev() instead of this.
6745  */
6746
6747 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6748 {
6749         ASSERT_RTNL();
6750
6751         if (head) {
6752                 list_move_tail(&dev->unreg_list, head);
6753         } else {
6754                 rollback_registered(dev);
6755                 /* Finish processing unregister after unlock */
6756                 net_set_todo(dev);
6757         }
6758 }
6759 EXPORT_SYMBOL(unregister_netdevice_queue);
6760
6761 /**
6762  *      unregister_netdevice_many - unregister many devices
6763  *      @head: list of devices
6764  *
6765  *  Note: As most callers use a stack allocated list_head,
6766  *  we force a list_del() to make sure stack wont be corrupted later.
6767  */
6768 void unregister_netdevice_many(struct list_head *head)
6769 {
6770         struct net_device *dev;
6771
6772         if (!list_empty(head)) {
6773                 rollback_registered_many(head);
6774                 list_for_each_entry(dev, head, unreg_list)
6775                         net_set_todo(dev);
6776                 list_del(head);
6777         }
6778 }
6779 EXPORT_SYMBOL(unregister_netdevice_many);
6780
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.
 *
 *      It is unregister_netdevice() wrapped in rtnl_lock()/rtnl_unlock().
 *      In general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        /* unregister_netdevice() must run with the rtnl semaphore held */
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6799
6800 /**
6801  *      dev_change_net_namespace - move device to different nethost namespace
6802  *      @dev: device
6803  *      @net: network namespace
6804  *      @pat: If not NULL name pattern to try if the current device name
6805  *            is already taken in the destination network namespace.
6806  *
6807  *      This function shuts down a device interface and moves it
6808  *      to a new network namespace. On success 0 is returned, on
6809  *      a failure a negative errno code is returned.
6810  *
6811  *      Callers must hold the rtnl semaphore.
6812  */
6813
6814 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6815 {
6816         int err;
6817
6818         ASSERT_RTNL();
6819
6820         /* Don't allow namespace local devices to be moved. */
6821         err = -EINVAL;
6822         if (dev->features & NETIF_F_NETNS_LOCAL)
6823                 goto out;
6824
6825         /* Ensure the device has been registered */
6826         if (dev->reg_state != NETREG_REGISTERED)
6827                 goto out;
6828
6829         /* Get out if there is nothing todo */
6830         err = 0;
6831         if (net_eq(dev_net(dev), net))
6832                 goto out;
6833
6834         /* Pick the destination device name, and ensure
6835          * we can use it in the destination network namespace.
6836          */
6837         err = -EEXIST;
6838         if (__dev_get_by_name(net, dev->name)) {
6839                 /* We get here if we can't use the current device name */
6840                 if (!pat)
6841                         goto out;
6842                 if (dev_get_valid_name(net, dev, pat) < 0)
6843                         goto out;
6844         }
6845
6846         /*
6847          * And now a mini version of register_netdevice unregister_netdevice.
6848          */
6849
6850         /* If device is running close it first. */
6851         dev_close(dev);
6852
6853         /* And unlink it from device chain */
6854         err = -ENODEV;
6855         unlist_netdevice(dev);
6856
6857         synchronize_net();
6858
6859         /* Shutdown queueing discipline. */
6860         dev_shutdown(dev);
6861
6862         /* Notify protocols, that we are about to destroy
6863            this device. They should clean all the things.
6864
6865            Note that dev->reg_state stays at NETREG_REGISTERED.
6866            This is wanted because this way 8021q and macvlan know
6867            the device is just moving and can keep their slaves up.
6868         */
6869         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6870         rcu_barrier();
6871         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6872         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6873
6874         /*
6875          *      Flush the unicast and multicast chains
6876          */
6877         dev_uc_flush(dev);
6878         dev_mc_flush(dev);
6879
6880         /* Send a netdev-removed uevent to the old namespace */
6881         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6882         netdev_adjacent_del_links(dev);
6883
6884         /* Actually switch the network namespace */
6885         dev_net_set(dev, net);
6886
6887         /* If there is an ifindex conflict assign a new one */
6888         if (__dev_get_by_index(net, dev->ifindex)) {
6889                 int iflink = (dev->iflink == dev->ifindex);
6890                 dev->ifindex = dev_new_index(net);
6891                 if (iflink)
6892                         dev->iflink = dev->ifindex;
6893         }
6894
6895         /* Send a netdev-add uevent to the new namespace */
6896         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6897         netdev_adjacent_add_links(dev);
6898
6899         /* Fixup kobjects */
6900         err = device_rename(&dev->dev, dev->name);
6901         WARN_ON(err);
6902
6903         /* Add the device back in the hashes */
6904         list_netdevice(dev);
6905
6906         /* Notify protocols, that a new device appeared. */
6907         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6908
6909         /*
6910          *      Prevent userspace races by waiting until the network
6911          *      device is fully setup before sending notifications.
6912          */
6913         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6914
6915         synchronize_net();
6916         err = 0;
6917 out:
6918         return err;
6919 }
6920 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6921
6922 static int dev_cpu_callback(struct notifier_block *nfb,
6923                             unsigned long action,
6924                             void *ocpu)
6925 {
6926         struct sk_buff **list_skb;
6927         struct sk_buff *skb;
6928         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6929         struct softnet_data *sd, *oldsd;
6930
6931         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6932                 return NOTIFY_OK;
6933
6934         local_irq_disable();
6935         cpu = smp_processor_id();
6936         sd = &per_cpu(softnet_data, cpu);
6937         oldsd = &per_cpu(softnet_data, oldcpu);
6938
6939         /* Find end of our completion_queue. */
6940         list_skb = &sd->completion_queue;
6941         while (*list_skb)
6942                 list_skb = &(*list_skb)->next;
6943         /* Append completion queue from offline CPU. */
6944         *list_skb = oldsd->completion_queue;
6945         oldsd->completion_queue = NULL;
6946
6947         /* Append output queue from offline CPU. */
6948         if (oldsd->output_queue) {
6949                 *sd->output_queue_tailp = oldsd->output_queue;
6950                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6951                 oldsd->output_queue = NULL;
6952                 oldsd->output_queue_tailp = &oldsd->output_queue;
6953         }
6954         /* Append NAPI poll list from offline CPU, with one exception :
6955          * process_backlog() must be called by cpu owning percpu backlog.
6956          * We properly handle process_queue & input_pkt_queue later.
6957          */
6958         while (!list_empty(&oldsd->poll_list)) {
6959                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
6960                                                             struct napi_struct,
6961                                                             poll_list);
6962
6963                 list_del_init(&napi->poll_list);
6964                 if (napi->poll == process_backlog)
6965                         napi->state = 0;
6966                 else
6967                         ____napi_schedule(sd, napi);
6968         }
6969
6970         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6971         local_irq_enable();
6972
6973         /* Process offline CPU's input_pkt_queue */
6974         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6975                 netif_rx_internal(skb);
6976                 input_queue_head_incr(oldsd);
6977         }
6978         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
6979                 netif_rx_internal(skb);
6980                 input_queue_head_incr(oldsd);
6981         }
6982
6983         return NOTIFY_OK;
6984 }
6985
6986
6987 /**
6988  *      netdev_increment_features - increment feature set by one
6989  *      @all: current feature set
6990  *      @one: new feature set
6991  *      @mask: mask feature set
6992  *
6993  *      Computes a new feature set after adding a device with feature set
6994  *      @one to the master device with current feature set @all.  Will not
6995  *      enable anything that is off in @mask. Returns the new feature set.
6996  */
6997 netdev_features_t netdev_increment_features(netdev_features_t all,
6998         netdev_features_t one, netdev_features_t mask)
6999 {
7000         if (mask & NETIF_F_GEN_CSUM)
7001                 mask |= NETIF_F_ALL_CSUM;
7002         mask |= NETIF_F_VLAN_CHALLENGED;
7003
7004         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7005         all &= one | ~NETIF_F_ALL_FOR_ALL;
7006
7007         /* If one device supports hw checksumming, set for all. */
7008         if (all & NETIF_F_GEN_CSUM)
7009                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7010
7011         return all;
7012 }
7013 EXPORT_SYMBOL(netdev_increment_features);
7014
7015 static struct hlist_head * __net_init netdev_create_hash(void)
7016 {
7017         int i;
7018         struct hlist_head *hash;
7019
7020         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7021         if (hash != NULL)
7022                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7023                         INIT_HLIST_HEAD(&hash[i]);
7024
7025         return hash;
7026 }
7027
7028 /* Initialize per network namespace state */
7029 static int __net_init netdev_init(struct net *net)
7030 {
7031         if (net != &init_net)
7032                 INIT_LIST_HEAD(&net->dev_base_head);
7033
7034         net->dev_name_head = netdev_create_hash();
7035         if (net->dev_name_head == NULL)
7036                 goto err_name;
7037
7038         net->dev_index_head = netdev_create_hash();
7039         if (net->dev_index_head == NULL)
7040                 goto err_idx;
7041
7042         return 0;
7043
7044 err_idx:
7045         kfree(net->dev_name_head);
7046 err_name:
7047         return -ENOMEM;
7048 }
7049
7050 /**
7051  *      netdev_drivername - network driver for the device
7052  *      @dev: network device
7053  *
7054  *      Determine network driver for device.
7055  */
7056 const char *netdev_drivername(const struct net_device *dev)
7057 {
7058         const struct device_driver *driver;
7059         const struct device *parent;
7060         const char *empty = "";
7061
7062         parent = dev->dev.parent;
7063         if (!parent)
7064                 return empty;
7065
7066         driver = parent->driver;
7067         if (driver && driver->name)
7068                 return driver->name;
7069         return empty;
7070 }
7071
7072 static int __netdev_printk(const char *level, const struct net_device *dev,
7073                            struct va_format *vaf)
7074 {
7075         int r;
7076
7077         if (dev && dev->dev.parent) {
7078                 r = dev_printk_emit(level[1] - '0',
7079                                     dev->dev.parent,
7080                                     "%s %s %s: %pV",
7081                                     dev_driver_string(dev->dev.parent),
7082                                     dev_name(dev->dev.parent),
7083                                     netdev_name(dev), vaf);
7084         } else if (dev) {
7085                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
7086         } else {
7087                 r = printk("%s(NULL net_device): %pV", level, vaf);
7088         }
7089
7090         return r;
7091 }
7092
7093 int netdev_printk(const char *level, const struct net_device *dev,
7094                   const char *format, ...)
7095 {
7096         struct va_format vaf;
7097         va_list args;
7098         int r;
7099
7100         va_start(args, format);
7101
7102         vaf.fmt = format;
7103         vaf.va = &args;
7104
7105         r = __netdev_printk(level, dev, &vaf);
7106
7107         va_end(args);
7108
7109         return r;
7110 }
7111 EXPORT_SYMBOL(netdev_printk);
7112
7113 #define define_netdev_printk_level(func, level)                 \
7114 int func(const struct net_device *dev, const char *fmt, ...)    \
7115 {                                                               \
7116         int r;                                                  \
7117         struct va_format vaf;                                   \
7118         va_list args;                                           \
7119                                                                 \
7120         va_start(args, fmt);                                    \
7121                                                                 \
7122         vaf.fmt = fmt;                                          \
7123         vaf.va = &args;                                         \
7124                                                                 \
7125         r = __netdev_printk(level, dev, &vaf);                  \
7126                                                                 \
7127         va_end(args);                                           \
7128                                                                 \
7129         return r;                                               \
7130 }                                                               \
7131 EXPORT_SYMBOL(func);
7132
7133 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7134 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7135 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7136 define_netdev_printk_level(netdev_err, KERN_ERR);
7137 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7138 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7139 define_netdev_printk_level(netdev_info, KERN_INFO);
7140
7141 static void __net_exit netdev_exit(struct net *net)
7142 {
7143         kfree(net->dev_name_head);
7144         kfree(net->dev_index_head);
7145 }
7146
7147 static struct pernet_operations __net_initdata netdev_net_ops = {
7148         .init = netdev_init,
7149         .exit = netdev_exit,
7150 };
7151
7152 static void __net_exit default_device_exit(struct net *net)
7153 {
7154         struct net_device *dev, *aux;
7155         /*
7156          * Push all migratable network devices back to the
7157          * initial network namespace
7158          */
7159         rtnl_lock();
7160         for_each_netdev_safe(net, dev, aux) {
7161                 int err;
7162                 char fb_name[IFNAMSIZ];
7163
7164                 /* Ignore unmoveable devices (i.e. loopback) */
7165                 if (dev->features & NETIF_F_NETNS_LOCAL)
7166                         continue;
7167
7168                 /* Leave virtual devices for the generic cleanup */
7169                 if (dev->rtnl_link_ops)
7170                         continue;
7171
7172                 /* Push remaining network devices to init_net */
7173                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7174                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7175                 if (err) {
7176                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7177                                  __func__, dev->name, err);
7178                         BUG();
7179                 }
7180         }
7181         rtnl_unlock();
7182 }
7183
7184 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7185 {
7186         /* Return with the rtnl_lock held when there are no network
7187          * devices unregistering in any network namespace in net_list.
7188          */
7189         struct net *net;
7190         bool unregistering;
7191         DEFINE_WAIT(wait);
7192
7193         for (;;) {
7194                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7195                                 TASK_UNINTERRUPTIBLE);
7196                 unregistering = false;
7197                 rtnl_lock();
7198                 list_for_each_entry(net, net_list, exit_list) {
7199                         if (net->dev_unreg_count > 0) {
7200                                 unregistering = true;
7201                                 break;
7202                         }
7203                 }
7204                 if (!unregistering)
7205                         break;
7206                 __rtnl_unlock();
7207                 schedule();
7208         }
7209         finish_wait(&netdev_unregistering_wq, &wait);
7210 }
7211
7212 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7213 {
7214         /* At exit all network devices most be removed from a network
7215          * namespace.  Do this in the reverse order of registration.
7216          * Do this across as many network namespaces as possible to
7217          * improve batching efficiency.
7218          */
7219         struct net_device *dev;
7220         struct net *net;
7221         LIST_HEAD(dev_kill_list);
7222
7223         /* To prevent network device cleanup code from dereferencing
7224          * loopback devices or network devices that have been freed
7225          * wait here for all pending unregistrations to complete,
7226          * before unregistring the loopback device and allowing the
7227          * network namespace be freed.
7228          *
7229          * The netdev todo list containing all network devices
7230          * unregistrations that happen in default_device_exit_batch
7231          * will run in the rtnl_unlock() at the end of
7232          * default_device_exit_batch.
7233          */
7234         rtnl_lock_unregistering(net_list);
7235         list_for_each_entry(net, net_list, exit_list) {
7236                 for_each_netdev_reverse(net, dev) {
7237                         if (dev->rtnl_link_ops)
7238                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7239                         else
7240                                 unregister_netdevice_queue(dev, &dev_kill_list);
7241                 }
7242         }
7243         unregister_netdevice_many(&dev_kill_list);
7244         rtnl_unlock();
7245 }
7246
7247 static struct pernet_operations __net_initdata default_device_ops = {
7248         .exit = default_device_exit,
7249         .exit_batch = default_device_exit_batch,
7250 };
7251
7252 /*
7253  *      Initialize the DEV module. At boot time this walks the device list and
7254  *      unhooks any devices that fail to initialise (normally hardware not
7255  *      present) and leaves us with a valid list of present and active devices.
7256  *
7257  */
7258
7259 /*
7260  *       This is called single threaded during boot, so no need
7261  *       to take the rtnl semaphore.
7262  */
7263 static int __init net_dev_init(void)
7264 {
7265         int i, rc = -ENOMEM;
7266
7267         BUG_ON(!dev_boot_phase);
7268
7269         if (dev_proc_init())
7270                 goto out;
7271
7272         if (netdev_kobject_init())
7273                 goto out;
7274
7275         INIT_LIST_HEAD(&ptype_all);
7276         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7277                 INIT_LIST_HEAD(&ptype_base[i]);
7278
7279         INIT_LIST_HEAD(&offload_base);
7280
7281         if (register_pernet_subsys(&netdev_net_ops))
7282                 goto out;
7283
7284         /*
7285          *      Initialise the packet receive queues.
7286          */
7287
7288         for_each_possible_cpu(i) {
7289                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7290
7291                 skb_queue_head_init(&sd->input_pkt_queue);
7292                 skb_queue_head_init(&sd->process_queue);
7293                 INIT_LIST_HEAD(&sd->poll_list);
7294                 sd->output_queue_tailp = &sd->output_queue;
7295 #ifdef CONFIG_RPS
7296                 sd->csd.func = rps_trigger_softirq;
7297                 sd->csd.info = sd;
7298                 sd->cpu = i;
7299 #endif
7300
7301                 sd->backlog.poll = process_backlog;
7302                 sd->backlog.weight = weight_p;
7303         }
7304
7305         dev_boot_phase = 0;
7306
7307         /* The loopback device is special if any other network devices
7308          * is present in a network namespace the loopback device must
7309          * be present. Since we now dynamically allocate and free the
7310          * loopback device ensure this invariant is maintained by
7311          * keeping the loopback device as the first device on the
7312          * list of network devices.  Ensuring the loopback devices
7313          * is the first device that appears and the last network device
7314          * that disappears.
7315          */
7316         if (register_pernet_device(&loopback_net_ops))
7317                 goto out;
7318
7319         if (register_pernet_device(&default_device_ops))
7320                 goto out;
7321
7322         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7323         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7324
7325         hotcpu_notifier(dev_cpu_callback, 0);
7326         dst_init();
7327         rc = 0;
7328 out:
7329         return rc;
7330 }
7331
7332 subsys_initcall(net_dev_init);