net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/net_tstamp.h>
 137 #include <linux/jump_label.h>
 138 #include <net/flow_keys.h>
 139
 140 #include "net-sysfs.h"
 141
 142 /* Instead of increasing this, you should create a hash table. */
 143 #define MAX_GRO_SKBS 8
 144
 145 /* This should be increased if a protocol with a bigger head is added. */
 146 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 147
 148 /*
 149  *      The list of packet types we will receive (as opposed to discard)
 150  *      and the routines to invoke.
      *
      *      Handlers live in ptype_base[], bucketed by
      *      ntohs(type) & PTYPE_HASH_MASK (see ptype_head()); handlers
      *      registered for ETH_P_ALL live on the separate ptype_all list.
 151  *
 152  *      Why 16? Because with 16 the only overlap we get on a hash of the
 153  *      low nibble of the protocol value is RARP/SNAP/X.25.
 154  *
 155  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 156  *             sure which should go first, but I bet it won't make much
 157  *             difference if we are running VLANs.  The good news is that
 158  *             this protocol won't be in the list unless compiled in, so
 159  *             the average user (w/out VLANs) will not be adversely affected.
 160  *             --BLG
 161  *
 162  *              0800    IP
 163  *              8100    802.1Q VLAN
 164  *              0001    802.3
 165  *              0002    AX.25
 166  *              0004    802.2
 167  *              8035    RARP
 168  *              0005    SNAP
 169  *              0805    X.25
 170  *              0806    ARP
 171  *              8137    IPX
 172  *              0009    Localtalk
 173  *              86DD    IPv6
 174  */
 175
 176 #define PTYPE_HASH_SIZE (16)
 177 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 178
 179 static DEFINE_SPINLOCK(ptype_lock);	/* held by dev_add_pack()/__dev_remove_pack() around list updates */
 180 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;	/* per-protocol handler buckets */
 181 static struct list_head ptype_all __read_mostly;        /* Taps: ETH_P_ALL handlers, see ptype_head() */
 182
 183 /*
 184  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 185  * semaphore.
 186  *
 187  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 188  *
 189  * Writers must hold the rtnl semaphore while they loop through the
 190  * dev_base_head list, and hold dev_base_lock for writing when they do the
 191  * actual updates.  This allows pure readers to access the list even
 192  * while a writer is preparing to update it.
 193  *
 194  * To put it another way, dev_base_lock is held for writing only to
 195  * protect against pure readers; the rtnl semaphore provides the
 196  * protection against other writers.
 197  *
 198  * For example usages, see register_netdevice() and
 199  * unregister_netdevice(), which must be called with the rtnl
 200  * semaphore held.  In this file, list_netdevice() and unlist_netdevice()
      * follow the same pattern: ASSERT_RTNL(), then write_lock_bh() around
      * the actual list/hash updates.
 201  */
 202 DEFINE_RWLOCK(dev_base_lock);
 203 EXPORT_SYMBOL(dev_base_lock);
 204
 205 static inline void dev_base_seq_inc(struct net *net)
 206 {
 207         while (++net->dev_base_seq == 0);
 208 }
 209
 210 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 211 {
 212         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 213         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 214 }
 215
 216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 217 {
 218         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 219 }
 220
 221 static inline void rps_lock(struct softnet_data *sd)
 222 {
 223 #ifdef CONFIG_RPS
 224         spin_lock(&sd->input_pkt_queue.lock);
 225 #endif
 226 }
 227
 228 static inline void rps_unlock(struct softnet_data *sd)
 229 {
 230 #ifdef CONFIG_RPS
 231         spin_unlock(&sd->input_pkt_queue.lock);
 232 #endif
 233 }
 234
 235 /* Device list insertion */
 236 static int list_netdevice(struct net_device *dev)
 237 {
 238         struct net *net = dev_net(dev);
 239
 240         ASSERT_RTNL();
 241
 242         write_lock_bh(&dev_base_lock);
 243         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 244         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 245         hlist_add_head_rcu(&dev->index_hlist,
 246                            dev_index_hash(net, dev->ifindex));
 247         write_unlock_bh(&dev_base_lock);
 248
 249         dev_base_seq_inc(net);
 250
 251         return 0;
 252 }
 253
 254 /* Device list removal
 255  * caller must respect a RCU grace period before freeing/reusing dev
 256  */
 257 static void unlist_netdevice(struct net_device *dev)
 258 {
 259         ASSERT_RTNL();
 260
 261         /* Unlink dev from the device chain */
 262         write_lock_bh(&dev_base_lock);
 263         list_del_rcu(&dev->dev_list);
 264         hlist_del_rcu(&dev->name_hlist);
 265         hlist_del_rcu(&dev->index_hlist);
 266         write_unlock_bh(&dev_base_lock);
 267
 268         dev_base_seq_inc(dev_net(dev));
 269 }
 270
 271 /*
 272  *      Our notifier list (netdev_chain).
      *      NOTE(review): declared as a raw notifier head, which presumably
      *      does no locking of its own -- confirm how callers serialize.
 273  */
 274
 275 static RAW_NOTIFIER_HEAD(netdev_chain);
 276
 277 /*
 278  *      Device drivers call our routines to queue packets here. We empty the
 279  *      queue in the local softnet handler.
      *
      *      One struct softnet_data exists per CPU.  With CONFIG_RPS set,
      *      rps_lock()/rps_unlock() take its input_pkt_queue.lock.
 280  */
 281
 282 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 283 EXPORT_PER_CPU_SYMBOL(softnet_data);
 284
 285 #ifdef CONFIG_LOCKDEP
 286 /*
 287  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 288  * according to dev->type
      *
      * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
      * index found for a device type by netdev_lock_pos() is used to pick
      * both the class name and the lock_class_key, so the two tables must
      * keep the same length and order.  The last entry doubles as the
      * default for device types that are not listed.
 289  */
 290 static const unsigned short netdev_lock_type[] =
 291         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 292          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 293          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 294          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 295          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 296          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 297          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 298          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 299          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 300          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 301          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 302          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 303          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 304          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 305          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 306          ARPHRD_VOID, ARPHRD_NONE};
 307
     /* Lock class names; entry i belongs to netdev_lock_type[i]. */
 308 static const char *const netdev_lock_name[] =
 309         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 310          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 311          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 312          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 313          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 314          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 315          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 316          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 317          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 318          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 319          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 320          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 321          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 322          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 323          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 324          "_xmit_VOID", "_xmit_NONE"};
 325
     /*
      * One key per device type for the xmit lock and one for the address
      * list lock.  Both lock kinds share the "_xmit_*" names above.
      */
 326 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 327 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330 {
 331         int i;
 332
 333         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334                 if (netdev_lock_type[i] == dev_type)
 335                         return i;
 336         /* the last key is used by default */
 337         return ARRAY_SIZE(netdev_lock_type) - 1;
 338 }
 339
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343         int i;
 344
 345         i = netdev_lock_pos(dev_type);
 346         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347                                    netdev_lock_name[i]);
 348 }
 349
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352         int i;
 353
 354         i = netdev_lock_pos(dev->type);
 355         lockdep_set_class_and_name(&dev->addr_list_lock,
 356                                    &netdev_addr_lock_key[i],
 357                                    netdev_lock_name[i]);
 358 }
 359 #else
 360 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 361                                                  unsigned short dev_type)
 362 {
             /* Stub: no lock class to assign when CONFIG_LOCKDEP is off. */
 363 }
 364 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 365 {
             /* Stub: no lock class to assign when CONFIG_LOCKDEP is off. */
 366 }
 367 #endif
 368
 369 /*******************************************************************************
 370
 371                 Protocol management and registration routines
 372
 373 *******************************************************************************/
 374
 375 /*
 376  *      Add a protocol ID to the list. Now that the input handler is
 377  *      smarter we can dispense with all the messy stuff that used to be
 378  *      here.
 379  *
 380  *      BEWARE!!! Protocol handlers, mangling input packets,
 381  *      MUST BE last in hash buckets and checking protocol handlers
 382  *      MUST start from promiscuous ptype_all chain in net_bh.
 383  *      It is true now, do not change it.
 384  *      Explanation follows: if protocol handler, mangling packet, will
 385  *      be the first on list, it is not able to sense, that packet
 386  *      is cloned and should be copied-on-write, so that it will
 387  *      change it and subsequent readers will get broken packet.
 388  *                                                      --ANK (980803)
 389  */
 390
 391 static inline struct list_head *ptype_head(const struct packet_type *pt)
 392 {
 393         if (pt->type == htons(ETH_P_ALL))
 394                 return &ptype_all;
 395         else
 396                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397 }
 398
 399 /**
 400  *      dev_add_pack - add packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Add a protocol handler to the networking stack. The passed &packet_type
 404  *      is linked into kernel lists and may not be freed until it has been
 405  *      removed from the kernel lists.
 406  *
 407  *      This call does not sleep therefore it can not
 408  *      guarantee all CPU's that are in middle of receiving packets
 409  *      will see the new packet type (until the next received packet).
 410  */
 411
 412 void dev_add_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415
 416         spin_lock(&ptype_lock);
 417         list_add_rcu(&pt->list, head);
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(dev_add_pack);
 421
 422 /**
 423  *      __dev_remove_pack        - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      The packet type might still be in use by receivers
 432  *      and must not be freed until after all the CPU's have gone
 433  *      through a quiescent state.
 434  */
 435 void __dev_remove_pack(struct packet_type *pt)
 436 {
 437         struct list_head *head = ptype_head(pt);
 438         struct packet_type *pt1;
 439
 440         spin_lock(&ptype_lock);
 441
 442         list_for_each_entry(pt1, head, list) {
 443                 if (pt == pt1) {
 444                         list_del_rcu(&pt->list);
 445                         goto out;
 446                 }
 447         }
 448
 449         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 450 out:
 451         spin_unlock(&ptype_lock);
 452 }
 453 EXPORT_SYMBOL(__dev_remove_pack);
 454
 455 /**
 456  *      dev_remove_pack  - remove packet handler
 457  *      @pt: packet type declaration
 458  *
 459  *      Remove a protocol handler that was previously added to the kernel
 460  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461  *      from the kernel lists and can be freed or reused once this function
 462  *      returns.
 463  *
 464  *      This call sleeps to guarantee that no CPU is looking at the packet
 465  *      type after return.  It is __dev_remove_pack() followed by
      *      synchronize_net().
 466  */
 467 void dev_remove_pack(struct packet_type *pt)
 468 {
             /* Unlink @pt from its handler list (warns if it is not there). */
 469         __dev_remove_pack(pt);
 470
             /* Wait here so the caller may free @pt as soon as we return. */
 471         synchronize_net();
 472 }
 473 EXPORT_SYMBOL(dev_remove_pack);
 474
 475 /******************************************************************************
 476
 477                       Device Boot-time Settings Routines
 478
 479 *******************************************************************************/
 480
 481 /* Boot time configuration table; filled by netdev_boot_setup_add(). A slot whose name starts with '\0' or ' ' is free. */
 482 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 483
 484 /**
 485  *      netdev_boot_setup_add   - add new setup entry
 486  *      @name: name of the device
 487  *      @map: configured settings for the device
 488  *
 489  *      Adds new setup entry to the dev_boot_setup list.  The function
 490  *      returns 0 on error and 1 on success.  This is a generic routine to
 491  *      all netdevices.
 492  */
 493 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 494 {
 495         struct netdev_boot_setup *s;
 496         int i;
 497
 498         s = dev_boot_setup;
 499         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 500                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 501                         memset(s[i].name, 0, sizeof(s[i].name));
 502                         strlcpy(s[i].name, name, IFNAMSIZ);
 503                         memcpy(&s[i].map, map, sizeof(s[i].map));
 504                         break;
 505                 }
 506         }
 507
 508         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 509 }
 510
 511 /**
 512  *      netdev_boot_setup_check - check boot time settings
 513  *      @dev: the netdevice
 514  *
 515  *      Check boot time settings for the device.
 516  *      The found settings are set for the device to be used
 517  *      later in the device probing.
 518  *      Returns 0 if no settings found, 1 if they are.
 519  */
 520 int netdev_boot_setup_check(struct net_device *dev)
 521 {
 522         struct netdev_boot_setup *s = dev_boot_setup;
 523         int i;
 524
 525         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 526                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 527                     !strcmp(dev->name, s[i].name)) {
 528                         dev->irq        = s[i].map.irq;
 529                         dev->base_addr  = s[i].map.base_addr;
 530                         dev->mem_start  = s[i].map.mem_start;
 531                         dev->mem_end    = s[i].map.mem_end;
 532                         return 1;
 533                 }
 534         }
 535         return 0;
 536 }
 537 EXPORT_SYMBOL(netdev_boot_setup_check);
 538
 539
 540 /**
 541  *      netdev_boot_base        - get address from boot time settings
 542  *      @prefix: prefix for network device
 543  *      @unit: id for network device
 544  *
 545  *      Check boot time settings for the base address of device.
 546  *      The found settings are set for the device to be used
 547  *      later in the device probing.
 548  *      Returns 0 if no settings found.
 549  */
 550 unsigned long netdev_boot_base(const char *prefix, int unit)
 551 {
 552         const struct netdev_boot_setup *s = dev_boot_setup;
 553         char name[IFNAMSIZ];
 554         int i;
 555
 556         sprintf(name, "%s%d", prefix, unit);
 557
 558         /*
 559          * If device already registered then return base of 1
 560          * to indicate not to probe for this interface
 561          */
 562         if (__dev_get_by_name(&init_net, name))
 563                 return 1;
 564
 565         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 566                 if (!strcmp(name, s[i].name))
 567                         return s[i].map.base_addr;
 568         return 0;
 569 }
 570
 571 /*
 572  * Saves at boot time configured settings for any netdevice.
 573  */
 574 int __init netdev_boot_setup(char *str)
 575 {
 576         int ints[5];
 577         struct ifmap map;
 578
 579         str = get_options(str, ARRAY_SIZE(ints), ints);
 580         if (!str || !*str)
 581                 return 0;
 582
 583         /* Save settings */
 584         memset(&map, 0, sizeof(map));
 585         if (ints[0] > 0)
 586                 map.irq = ints[1];
 587         if (ints[0] > 1)
 588                 map.base_addr = ints[2];
 589         if (ints[0] > 2)
 590                 map.mem_start = ints[3];
 591         if (ints[0] > 3)
 592                 map.mem_end = ints[4];
 593
 594         /* Add new entry to the list */
 595         return netdev_boot_setup_add(str, &map);
 596 }
 597
 598 __setup("netdev=", netdev_boot_setup);
 599
 600 /*******************************************************************************
 601
 602                             Device Interface Subroutines
 603
 604 *******************************************************************************/
 605
 606 /**
 607  *      __dev_get_by_name       - find a device by its name
 608  *      @net: the applicable net namespace
 609  *      @name: name to find
 610  *
 611  *      Find an interface by name. Must be called under RTNL semaphore
 612  *      or @dev_base_lock. If the name is found a pointer to the device
 613  *      is returned. If the name is not found then %NULL is returned. The
 614  *      reference counters are not incremented so the caller must be
 615  *      careful with locks.
 616  */
 617
 618 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 619 {
 620         struct hlist_node *p;
 621         struct net_device *dev;
 622         struct hlist_head *head = dev_name_hash(net, name);
 623
 624         hlist_for_each_entry(dev, p, head, name_hlist)
 625                 if (!strncmp(dev->name, name, IFNAMSIZ))
 626                         return dev;
 627
 628         return NULL;
 629 }
 630 EXPORT_SYMBOL(__dev_get_by_name);
 631
 632 /**
 633  *      dev_get_by_name_rcu     - find a device by its name
 634  *      @net: the applicable net namespace
 635  *      @name: name to find
 636  *
 637  *      Find an interface by name.
 638  *      If the name is found a pointer to the device is returned.
 639  *      If the name is not found then %NULL is returned.
 640  *      The reference counters are not incremented so the caller must be
 641  *      careful with locks. The caller must hold RCU lock.
 642  */
 643
 644 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 645 {
 646         struct hlist_node *p;
 647         struct net_device *dev;
 648         struct hlist_head *head = dev_name_hash(net, name);
 649
 650         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 651                 if (!strncmp(dev->name, name, IFNAMSIZ))
 652                         return dev;
 653
 654         return NULL;
 655 }
 656 EXPORT_SYMBOL(dev_get_by_name_rcu);
 657
 658 /**
 659  *      dev_get_by_name         - find a device by its name
 660  *      @net: the applicable net namespace
 661  *      @name: name to find
 662  *
 663  *      Find an interface by name. This can be called from any
 664  *      context and does its own locking. The returned handle has
 665  *      the usage count incremented and the caller must use dev_put() to
 666  *      release it when it is no longer needed. %NULL is returned if no
 667  *      matching device is found.
 668  */
 669
 670 struct net_device *dev_get_by_name(struct net *net, const char *name)
 671 {
 672         struct net_device *dev;
 673
 674         rcu_read_lock();
 675         dev = dev_get_by_name_rcu(net, name);
 676         if (dev)
 677                 dev_hold(dev);
 678         rcu_read_unlock();
 679         return dev;
 680 }
 681 EXPORT_SYMBOL(dev_get_by_name);
 682
 683 /**
 684  *      __dev_get_by_index - find a device by its ifindex
 685  *      @net: the applicable net namespace
 686  *      @ifindex: index of device
 687  *
 688  *      Search for an interface by index. Returns %NULL if the device
 689  *      is not found or a pointer to the device. The device has not
 690  *      had its reference counter increased so the caller must be careful
 691  *      about locking. The caller must hold either the RTNL semaphore
 692  *      or @dev_base_lock.
 693  */
 694
 695 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 696 {
 697         struct hlist_node *p;
 698         struct net_device *dev;
 699         struct hlist_head *head = dev_index_hash(net, ifindex);
 700
 701         hlist_for_each_entry(dev, p, head, index_hlist)
 702                 if (dev->ifindex == ifindex)
 703                         return dev;
 704
 705         return NULL;
 706 }
 707 EXPORT_SYMBOL(__dev_get_by_index);
 708
 709 /**
 710  *      dev_get_by_index_rcu - find a device by its ifindex
 711  *      @net: the applicable net namespace
 712  *      @ifindex: index of device
 713  *
 714  *      Search for an interface by index. Returns %NULL if the device
 715  *      is not found or a pointer to the device. The device has not
 716  *      had its reference counter increased so the caller must be careful
 717  *      about locking. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 721 {
 722         struct hlist_node *p;
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_index_hash(net, ifindex);
 725
 726         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 727                 if (dev->ifindex == ifindex)
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_index_rcu);
 733
 734
 735 /**
 736  *      dev_get_by_index - find a device by its ifindex
 737  *      @net: the applicable net namespace
 738  *      @ifindex: index of device
 739  *
 740  *      Search for an interface by index. Returns NULL if the device
 741  *      is not found or a pointer to the device. The device returned has
 742  *      had a reference added and the pointer is safe until the user calls
 743  *      dev_put to indicate they have finished with it.
 744  */
 745
 746 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_index_rcu(net, ifindex);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_index);
 758
 759 /**
 760  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 761  *      @net: the applicable net namespace
 762  *      @type: media type of device
 763  *      @ha: hardware address
 764  *
 765  *      Search for an interface by MAC address. Returns NULL if the device
 766  *      is not found or a pointer to the device.
 767  *      The caller must hold RCU or RTNL.
 768  *      The returned device has not had its ref count increased
 769  *      and the caller must therefore be careful about locking
 770  *
 771  */
 772
 773 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 774                                        const char *ha)
 775 {
 776         struct net_device *dev;
 777
 778         for_each_netdev_rcu(net, dev)
 779                 if (dev->type == type &&
 780                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 786
 787 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev;
 790
 791         ASSERT_RTNL();
 792         for_each_netdev(net, dev)
 793                 if (dev->type == type)
 794                         return dev;
 795
 796         return NULL;
 797 }
 798 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 799
 800 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 801 {
 802         struct net_device *dev, *ret = NULL;
 803
 804         rcu_read_lock();
 805         for_each_netdev_rcu(net, dev)
 806                 if (dev->type == type) {
 807                         dev_hold(dev);
 808                         ret = dev;
 809                         break;
 810                 }
 811         rcu_read_unlock();
 812         return ret;
 813 }
 814 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 815
 816 /**
 817  *      dev_get_by_flags_rcu - find any device with given flags
 818  *      @net: the applicable net namespace
 819  *      @if_flags: IFF_* values
 820  *      @mask: bitmask of bits in if_flags to check
 821  *
 822  *      Search for any interface with the given flags. Returns NULL if a device
 823  *      is not found or a pointer to the device. Must be called inside
 824  *      rcu_read_lock(), and result refcount is unchanged.
 825  */
 826
 827 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 828                                     unsigned short mask)
 829 {
 830         struct net_device *dev, *ret;
 831
 832         ret = NULL;
 833         for_each_netdev_rcu(net, dev) {
 834                 if (((dev->flags ^ if_flags) & mask) == 0) {
 835                         ret = dev;
 836                         break;
 837                 }
 838         }
 839         return ret;
 840 }
 841 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 842
 843 /**
 844  *      dev_valid_name - check if name is okay for network device
 845  *      @name: name string
 846  *
 847  *      Network device names need to be valid file names to
 848  *      to allow sysfs to work.  We also disallow any kind of
 849  *      whitespace.
 850  */
 851 int dev_valid_name(const char *name)
 852 {
 853         if (*name == '\0')
 854                 return 0;
 855         if (strlen(name) >= IFNAMSIZ)
 856                 return 0;
 857         if (!strcmp(name, ".") || !strcmp(name, ".."))
 858                 return 0;
 859
 860         while (*name) {
 861                 if (*name == '/' || isspace(*name))
 862                         return 0;
 863                 name++;
 864         }
 865         return 1;
 866 }
 867 EXPORT_SYMBOL(dev_valid_name);
 868
 869 /**
 870  *      __dev_alloc_name - allocate a name for a device
 871  *      @net: network namespace to allocate the device name in
 872  *      @name: name format string
 873  *      @buf:  scratch buffer and result name string
 874  *
 875  *      Passed a format string - eg "lt%d" it will try and find a suitable
 876  *      id. It scans list of devices to build up a free map, then chooses
 877  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 878  *      while allocating the name and adding the device in order to avoid
 879  *      duplicates.
 880  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 881  *      Returns the number of the unit assigned or a negative errno code.
 882  */
 883
 884 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 885 {
 886         int i = 0;
 887         const char *p;
 888         const int max_netdevices = 8*PAGE_SIZE;
 889         unsigned long *inuse;
 890         struct net_device *d;
 891
 892         p = strnchr(name, IFNAMSIZ-1, '%');
 893         if (p) {
 894                 /*
 895                  * Verify the string as this thing may have come from
 896                  * the user.  There must be either one "%d" and no other "%"
 897                  * characters.
 898                  */
 899                 if (p[1] != 'd' || strchr(p + 2, '%'))
 900                         return -EINVAL;
 901
 902                 /* Use one page as a bit array of possible slots */
 903                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 904                 if (!inuse)
 905                         return -ENOMEM;
 906
 907                 for_each_netdev(net, d) {
 908                         if (!sscanf(d->name, name, &i))
 909                                 continue;
 910                         if (i < 0 || i >= max_netdevices)
 911                                 continue;
 912
 913                         /*  avoid cases where sscanf is not exact inverse of printf */
 914                         snprintf(buf, IFNAMSIZ, name, i);
 915                         if (!strncmp(buf, d->name, IFNAMSIZ))
 916                                 set_bit(i, inuse);
 917                 }
 918
 919                 i = find_first_zero_bit(inuse, max_netdevices);
 920                 free_page((unsigned long) inuse);
 921         }
 922
 923         if (buf != name)
 924                 snprintf(buf, IFNAMSIZ, name, i);
 925         if (!__dev_get_by_name(net, buf))
 926                 return i;
 927
 928         /* It is possible to run out of possible slots
 929          * when the name is long and there isn't enough space left
 930          * for the digits, or if all bits are used.
 931          */
 932         return -ENFILE;
 933 }
 934
 935 /**
 936  *      dev_alloc_name - allocate a name for a device
 937  *      @dev: device
 938  *      @name: name format string
 939  *
 940  *      Passed a format string - eg "lt%d" it will try and find a suitable
 941  *      id. It scans list of devices to build up a free map, then chooses
 942  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 943  *      while allocating the name and adding the device in order to avoid
 944  *      duplicates.
 945  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 946  *      Returns the number of the unit assigned or a negative errno code.
 947  */
 948
 949 int dev_alloc_name(struct net_device *dev, const char *name)
 950 {
 951         char buf[IFNAMSIZ];
 952         struct net *net;
 953         int ret;
 954
 955         BUG_ON(!dev_net(dev));
 956         net = dev_net(dev);
 957         ret = __dev_alloc_name(net, name, buf);
 958         if (ret >= 0)
 959                 strlcpy(dev->name, buf, IFNAMSIZ);
 960         return ret;
 961 }
 962 EXPORT_SYMBOL(dev_alloc_name);
 963
 964 static int dev_get_valid_name(struct net_device *dev, const char *name)
 965 {
 966         struct net *net;
 967
 968         BUG_ON(!dev_net(dev));
 969         net = dev_net(dev);
 970
 971         if (!dev_valid_name(name))
 972                 return -EINVAL;
 973
 974         if (strchr(name, '%'))
 975                 return dev_alloc_name(dev, name);
 976         else if (__dev_get_by_name(net, name))
 977                 return -EEXIST;
 978         else if (dev->name != name)
 979                 strlcpy(dev->name, name, IFNAMSIZ);
 980
 981         return 0;
 982 }
 983
 984 /**
 985  *      dev_change_name - change name of a device
 986  *      @dev: device
 987  *      @newname: name (or format string) must be at least IFNAMSIZ
 988  *
 989  *      Change name of a device, can pass format strings "eth%d".
 990  *      for wildcarding.
 991  */
 992 int dev_change_name(struct net_device *dev, const char *newname)
 993 {
 994         char oldname[IFNAMSIZ];
 995         int err = 0;
 996         int ret;
 997         struct net *net;
 998
 999         ASSERT_RTNL();
1000         BUG_ON(!dev_net(dev));
1001
1002         net = dev_net(dev);
1003         if (dev->flags & IFF_UP)
1004                 return -EBUSY;
1005
1006         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1007                 return 0;
1008
1009         memcpy(oldname, dev->name, IFNAMSIZ);
1010
1011         err = dev_get_valid_name(dev, newname);
1012         if (err < 0)
1013                 return err;
1014
1015 rollback:
1016         ret = device_rename(&dev->dev, dev->name);
1017         if (ret) {
1018                 memcpy(dev->name, oldname, IFNAMSIZ);
1019                 return ret;
1020         }
1021
1022         write_lock_bh(&dev_base_lock);
1023         hlist_del_rcu(&dev->name_hlist);
1024         write_unlock_bh(&dev_base_lock);
1025
1026         synchronize_rcu();
1027
1028         write_lock_bh(&dev_base_lock);
1029         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1030         write_unlock_bh(&dev_base_lock);
1031
1032         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1033         ret = notifier_to_errno(ret);
1034
1035         if (ret) {
1036                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1037                 if (err >= 0) {
1038                         err = ret;
1039                         memcpy(dev->name, oldname, IFNAMSIZ);
1040                         goto rollback;
1041                 } else {
1042                         printk(KERN_ERR
1043                                "%s: name change rollback failed: %d.\n",
1044                                dev->name, ret);
1045                 }
1046         }
1047
1048         return err;
1049 }
1050
1051 /**
1052  *      dev_set_alias - change ifalias of a device
1053  *      @dev: device
1054  *      @alias: name up to IFALIASZ
1055  *      @len: limit of bytes to copy from info
1056  *
1057  *      Set ifalias for a device,
1058  */
1059 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1060 {
1061         ASSERT_RTNL();
1062
1063         if (len >= IFALIASZ)
1064                 return -EINVAL;
1065
1066         if (!len) {
1067                 if (dev->ifalias) {
1068                         kfree(dev->ifalias);
1069                         dev->ifalias = NULL;
1070                 }
1071                 return 0;
1072         }
1073
1074         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1075         if (!dev->ifalias)
1076                 return -ENOMEM;
1077
1078         strlcpy(dev->ifalias, alias, len+1);
1079         return len;
1080 }
1081
1082
1083 /**
1084  *      netdev_features_change - device changes features
1085  *      @dev: device to cause notification
1086  *
1087  *      Called to indicate a device has changed features.
1088  */
1089 void netdev_features_change(struct net_device *dev)
1090 {
1091         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1092 }
1093 EXPORT_SYMBOL(netdev_features_change);
1094
1095 /**
1096  *      netdev_state_change - device changes state
1097  *      @dev: device to cause notification
1098  *
1099  *      Called to indicate a device has changed state. This function calls
1100  *      the notifier chains for netdev_chain and sends a NEWLINK message
1101  *      to the routing socket.
1102  */
1103 void netdev_state_change(struct net_device *dev)
1104 {
1105         if (dev->flags & IFF_UP) {
1106                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1107                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1108         }
1109 }
1110 EXPORT_SYMBOL(netdev_state_change);
1111
/*
 * Send @event for @dev down the netdevice notifier chain and return the
 * chain's result unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1117
1118 /**
1119  *      dev_load        - load a network module
1120  *      @net: the applicable net namespace
1121  *      @name: name of interface
1122  *
1123  *      If a network interface is not present and the process has suitable
1124  *      privileges this function loads the module. If module loading is not
1125  *      available in this kernel then it becomes a nop.
1126  */
1127
1128 void dev_load(struct net *net, const char *name)
1129 {
1130         struct net_device *dev;
1131         int no_module;
1132
1133         rcu_read_lock();
1134         dev = dev_get_by_name_rcu(net, name);
1135         rcu_read_unlock();
1136
1137         no_module = !dev;
1138         if (no_module && capable(CAP_NET_ADMIN))
1139                 no_module = request_module("netdev-%s", name);
1140         if (no_module && capable(CAP_SYS_MODULE)) {
1141                 if (!request_module("%s", name))
1142                         pr_err("Loading kernel module for a network device "
1143 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1144 "instead\n", name);
1145         }
1146 }
1147 EXPORT_SYMBOL(dev_load);
1148
1149 static int __dev_open(struct net_device *dev)
1150 {
1151         const struct net_device_ops *ops = dev->netdev_ops;
1152         int ret;
1153
1154         ASSERT_RTNL();
1155
1156         if (!netif_device_present(dev))
1157                 return -ENODEV;
1158
1159         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1160         ret = notifier_to_errno(ret);
1161         if (ret)
1162                 return ret;
1163
1164         set_bit(__LINK_STATE_START, &dev->state);
1165
1166         if (ops->ndo_validate_addr)
1167                 ret = ops->ndo_validate_addr(dev);
1168
1169         if (!ret && ops->ndo_open)
1170                 ret = ops->ndo_open(dev);
1171
1172         if (ret)
1173                 clear_bit(__LINK_STATE_START, &dev->state);
1174         else {
1175                 dev->flags |= IFF_UP;
1176                 net_dmaengine_get();
1177                 dev_set_rx_mode(dev);
1178                 dev_activate(dev);
1179         }
1180
1181         return ret;
1182 }
1183
1184 /**
1185  *      dev_open        - prepare an interface for use.
1186  *      @dev:   device to open
1187  *
1188  *      Takes a device from down to up state. The device's private open
1189  *      function is invoked and then the multicast lists are loaded. Finally
1190  *      the device is moved into the up state and a %NETDEV_UP message is
1191  *      sent to the netdev notifier chain.
1192  *
1193  *      Calling this function on an active interface is a nop. On a failure
1194  *      a negative errno code is returned.
1195  */
1196 int dev_open(struct net_device *dev)
1197 {
1198         int ret;
1199
1200         if (dev->flags & IFF_UP)
1201                 return 0;
1202
1203         ret = __dev_open(dev);
1204         if (ret < 0)
1205                 return ret;
1206
1207         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1208         call_netdevice_notifiers(NETDEV_UP, dev);
1209
1210         return ret;
1211 }
1212 EXPORT_SYMBOL(dev_open);
1213
1214 static int __dev_close_many(struct list_head *head)
1215 {
1216         struct net_device *dev;
1217
1218         ASSERT_RTNL();
1219         might_sleep();
1220
1221         list_for_each_entry(dev, head, unreg_list) {
1222                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1223
1224                 clear_bit(__LINK_STATE_START, &dev->state);
1225
1226                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1227                  * can be even on different cpu. So just clear netif_running().
1228                  *
1229                  * dev->stop() will invoke napi_disable() on all of it's
1230                  * napi_struct instances on this device.
1231                  */
1232                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1233         }
1234
1235         dev_deactivate_many(head);
1236
1237         list_for_each_entry(dev, head, unreg_list) {
1238                 const struct net_device_ops *ops = dev->netdev_ops;
1239
1240                 /*
1241                  *      Call the device specific close. This cannot fail.
1242                  *      Only if device is UP
1243                  *
1244                  *      We allow it to be called even after a DETACH hot-plug
1245                  *      event.
1246                  */
1247                 if (ops->ndo_stop)
1248                         ops->ndo_stop(dev);
1249
1250                 dev->flags &= ~IFF_UP;
1251                 net_dmaengine_put();
1252         }
1253
1254         return 0;
1255 }
1256
1257 static int __dev_close(struct net_device *dev)
1258 {
1259         int retval;
1260         LIST_HEAD(single);
1261
1262         list_add(&dev->unreg_list, &single);
1263         retval = __dev_close_many(&single);
1264         list_del(&single);
1265         return retval;
1266 }
1267
1268 static int dev_close_many(struct list_head *head)
1269 {
1270         struct net_device *dev, *tmp;
1271         LIST_HEAD(tmp_list);
1272
1273         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1274                 if (!(dev->flags & IFF_UP))
1275                         list_move(&dev->unreg_list, &tmp_list);
1276
1277         __dev_close_many(head);
1278
1279         list_for_each_entry(dev, head, unreg_list) {
1280                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1281                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1282         }
1283
1284         /* rollback_registered_many needs the complete original list */
1285         list_splice(&tmp_list, head);
1286         return 0;
1287 }
1288
1289 /**
1290  *      dev_close - shutdown an interface.
1291  *      @dev: device to shutdown
1292  *
1293  *      This function moves an active device into down state. A
1294  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1295  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1296  *      chain.
1297  */
1298 int dev_close(struct net_device *dev)
1299 {
1300         if (dev->flags & IFF_UP) {
1301                 LIST_HEAD(single);
1302
1303                 list_add(&dev->unreg_list, &single);
1304                 dev_close_many(&single);
1305                 list_del(&single);
1306         }
1307         return 0;
1308 }
1309 EXPORT_SYMBOL(dev_close);
1310
1311
1312 /**
1313  *      dev_disable_lro - disable Large Receive Offload on a device
1314  *      @dev: device
1315  *
1316  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1317  *      called under RTNL.  This is needed if received packets may be
1318  *      forwarded to another interface.
1319  */
1320 void dev_disable_lro(struct net_device *dev)
1321 {
1322         /*
1323          * If we're trying to disable lro on a vlan device
1324          * use the underlying physical device instead
1325          */
1326         if (is_vlan_dev(dev))
1327                 dev = vlan_dev_real_dev(dev);
1328
1329         dev->wanted_features &= ~NETIF_F_LRO;
1330         netdev_update_features(dev);
1331
1332         if (unlikely(dev->features & NETIF_F_LRO))
1333                 netdev_WARN(dev, "failed to disable LRO!\n");
1334 }
1335 EXPORT_SYMBOL(dev_disable_lro);
1336
1337
1338 static int dev_boot_phase = 1;
1339
1340 /**
1341  *      register_netdevice_notifier - register a network notifier block
1342  *      @nb: notifier
1343  *
1344  *      Register a notifier to be called when network device events occur.
1345  *      The notifier passed is linked into the kernel structures and must
1346  *      not be reused until it has been unregistered. A negative errno code
1347  *      is returned on a failure.
1348  *
1349  *      When registered all registration and up events are replayed
1350  *      to the new notifier to allow device to have a race free
1351  *      view of the network device list.
1352  */
1353
1354 int register_netdevice_notifier(struct notifier_block *nb)
1355 {
1356         struct net_device *dev;
1357         struct net_device *last;
1358         struct net *net;
1359         int err;
1360
1361         rtnl_lock();
1362         err = raw_notifier_chain_register(&netdev_chain, nb);
1363         if (err)
1364                 goto unlock;
1365         if (dev_boot_phase)
1366                 goto unlock;
1367         for_each_net(net) {
1368                 for_each_netdev(net, dev) {
1369                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1370                         err = notifier_to_errno(err);
1371                         if (err)
1372                                 goto rollback;
1373
1374                         if (!(dev->flags & IFF_UP))
1375                                 continue;
1376
1377                         nb->notifier_call(nb, NETDEV_UP, dev);
1378                 }
1379         }
1380
1381 unlock:
1382         rtnl_unlock();
1383         return err;
1384
1385 rollback:
1386         last = dev;
1387         for_each_net(net) {
1388                 for_each_netdev(net, dev) {
1389                         if (dev == last)
1390                                 goto outroll;
1391
1392                         if (dev->flags & IFF_UP) {
1393                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1394                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1395                         }
1396                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1397                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1398                 }
1399         }
1400
1401 outroll:
1402         raw_notifier_chain_unregister(&netdev_chain, nb);
1403         goto unlock;
1404 }
1405 EXPORT_SYMBOL(register_netdevice_notifier);
1406
1407 /**
1408  *      unregister_netdevice_notifier - unregister a network notifier block
1409  *      @nb: notifier
1410  *
1411  *      Unregister a notifier previously registered by
1412  *      register_netdevice_notifier(). The notifier is unlinked into the
1413  *      kernel structures and may then be reused. A negative errno code
1414  *      is returned on a failure.
1415  *
1416  *      After unregistering unregister and down device events are synthesized
1417  *      for all devices on the device list to the removed notifier to remove
1418  *      the need for special case cleanup code.
1419  */
1420
1421 int unregister_netdevice_notifier(struct notifier_block *nb)
1422 {
1423         struct net_device *dev;
1424         struct net *net;
1425         int err;
1426
1427         rtnl_lock();
1428         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1429         if (err)
1430                 goto unlock;
1431
1432         for_each_net(net) {
1433                 for_each_netdev(net, dev) {
1434                         if (dev->flags & IFF_UP) {
1435                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1436                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1437                         }
1438                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1439                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1440                 }
1441         }
1442 unlock:
1443         rtnl_unlock();
1444         return err;
1445 }
1446 EXPORT_SYMBOL(unregister_netdevice_notifier);
1447
1448 /**
1449  *      call_netdevice_notifiers - call all network notifier blocks
1450  *      @val: value passed unmodified to notifier function
1451  *      @dev: net_device pointer passed unmodified to notifier function
1452  *
1453  *      Call all network notifier blocks.  Parameters and return value
1454  *      are as for raw_notifier_call_chain().
1455  */
1456
1457 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1458 {
1459         ASSERT_RTNL();
1460         return raw_notifier_call_chain(&netdev_chain, val, dev);
1461 }
1462 EXPORT_SYMBOL(call_netdevice_notifiers);
1463
1464 static struct jump_label_key netstamp_needed __read_mostly;
1465 #ifdef HAVE_JUMP_LABEL
1466 /* We are not allowed to call jump_label_dec() from irq context
1467  * If net_disable_timestamp() is called from irq context, defer the
1468  * jump_label_dec() calls.
1469  */
1470 static atomic_t netstamp_needed_deferred;
1471 #endif
1472
1473 void net_enable_timestamp(void)
1474 {
1475 #ifdef HAVE_JUMP_LABEL
1476         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1477
1478         if (deferred) {
1479                 while (--deferred)
1480                         jump_label_dec(&netstamp_needed);
1481                 return;
1482         }
1483 #endif
1484         WARN_ON(in_interrupt());
1485         jump_label_inc(&netstamp_needed);
1486 }
1487 EXPORT_SYMBOL(net_enable_timestamp);
1488
1489 void net_disable_timestamp(void)
1490 {
1491 #ifdef HAVE_JUMP_LABEL
1492         if (in_interrupt()) {
1493                 atomic_inc(&netstamp_needed_deferred);
1494                 return;
1495         }
1496 #endif
1497         jump_label_dec(&netstamp_needed);
1498 }
1499 EXPORT_SYMBOL(net_disable_timestamp);
1500
1501 static inline void net_timestamp_set(struct sk_buff *skb)
1502 {
1503         skb->tstamp.tv64 = 0;
1504         if (static_branch(&netstamp_needed))
1505                 __net_timestamp(skb);
1506 }
1507
/*
 * Timestamp SKB when timestamping is enabled, COND holds and SKB does not
 * carry a timestamp yet.
 *
 * The body is wrapped in do { } while (0) so that the expansion is one
 * statement: it needs the caller's trailing semicolon and cannot capture
 * an "else" that follows the invocation.  The last line deliberately has
 * no line-continuation, so nothing after the macro is spliced into it.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_branch(&netstamp_needed)) {		\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1513
1514 static int net_hwtstamp_validate(struct ifreq *ifr)
1515 {
1516         struct hwtstamp_config cfg;
1517         enum hwtstamp_tx_types tx_type;
1518         enum hwtstamp_rx_filters rx_filter;
1519         int tx_type_valid = 0;
1520         int rx_filter_valid = 0;
1521
1522         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1523                 return -EFAULT;
1524
1525         if (cfg.flags) /* reserved for future extensions */
1526                 return -EINVAL;
1527
1528         tx_type = cfg.tx_type;
1529         rx_filter = cfg.rx_filter;
1530
1531         switch (tx_type) {
1532         case HWTSTAMP_TX_OFF:
1533         case HWTSTAMP_TX_ON:
1534         case HWTSTAMP_TX_ONESTEP_SYNC:
1535                 tx_type_valid = 1;
1536                 break;
1537         }
1538
1539         switch (rx_filter) {
1540         case HWTSTAMP_FILTER_NONE:
1541         case HWTSTAMP_FILTER_ALL:
1542         case HWTSTAMP_FILTER_SOME:
1543         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1544         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1545         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1546         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1547         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1548         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1549         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1550         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1551         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1552         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1553         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1554         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1555                 rx_filter_valid = 1;
1556                 break;
1557         }
1558
1559         if (!tx_type_valid || !rx_filter_valid)
1560                 return -ERANGE;
1561
1562         return 0;
1563 }
1564
1565 static inline bool is_skb_forwardable(struct net_device *dev,
1566                                       struct sk_buff *skb)
1567 {
1568         unsigned int len;
1569
1570         if (!(dev->flags & IFF_UP))
1571                 return false;
1572
1573         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1574         if (skb->len <= len)
1575                 return true;
1576
1577         /* if TSO is enabled, we don't care about the length as the packet
1578          * could be forwarded without being segmented before
1579          */
1580         if (skb_is_gso(skb))
1581                 return true;
1582
1583         return false;
1584 }
1585
1586 /**
1587  * dev_forward_skb - loopback an skb to another netif
1588  *
1589  * @dev: destination network device
1590  * @skb: buffer to forward
1591  *
1592  * return values:
1593  *      NET_RX_SUCCESS  (no congestion)
1594  *      NET_RX_DROP     (packet was dropped, but freed)
1595  *
1596  * dev_forward_skb can be used for injecting an skb from the
1597  * start_xmit function of one device into the receive queue
1598  * of another device.
1599  *
1600  * The receiving device may be in another namespace, so
1601  * we have to clear all information in the skb that could
1602  * impact namespace isolation.
1603  */
1604 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1605 {
1606         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1607                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1608                         atomic_long_inc(&dev->rx_dropped);
1609                         kfree_skb(skb);
1610                         return NET_RX_DROP;
1611                 }
1612         }
1613
1614         skb_orphan(skb);
1615         nf_reset(skb);
1616
1617         if (unlikely(!is_skb_forwardable(dev, skb))) {
1618                 atomic_long_inc(&dev->rx_dropped);
1619                 kfree_skb(skb);
1620                 return NET_RX_DROP;
1621         }
1622         skb_set_dev(skb, dev);
1623         skb->tstamp.tv64 = 0;
1624         skb->pkt_type = PACKET_HOST;
1625         skb->protocol = eth_type_trans(skb, dev);
1626         return netif_rx(skb);
1627 }
1628 EXPORT_SYMBOL_GPL(dev_forward_skb);
1629
1630 static inline int deliver_skb(struct sk_buff *skb,
1631                               struct packet_type *pt_prev,
1632                               struct net_device *orig_dev)
1633 {
1634         atomic_inc(&skb->users);
1635         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1636 }
1637
1638 /*
1639  *      Support routine. Sends outgoing frames to any network
1640  *      taps currently in use.
1641  */
1642
1643 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1644 {
1645         struct packet_type *ptype;
1646         struct sk_buff *skb2 = NULL;
1647         struct packet_type *pt_prev = NULL;
1648
1649         rcu_read_lock();
1650         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1651                 /* Never send packets back to the socket
1652                  * they originated from - MvS (miquels@drinkel.ow.org)
1653                  */
1654                 if ((ptype->dev == dev || !ptype->dev) &&
1655                     (ptype->af_packet_priv == NULL ||
1656                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1657                         if (pt_prev) {
1658                                 deliver_skb(skb2, pt_prev, skb->dev);
1659                                 pt_prev = ptype;
1660                                 continue;
1661                         }
1662
1663                         skb2 = skb_clone(skb, GFP_ATOMIC);
1664                         if (!skb2)
1665                                 break;
1666
1667                         net_timestamp_set(skb2);
1668
1669                         /* skb->nh should be correctly
1670                            set by sender, so that the second statement is
1671                            just protection against buggy protocols.
1672                          */
1673                         skb_reset_mac_header(skb2);
1674
1675                         if (skb_network_header(skb2) < skb2->data ||
1676                             skb2->network_header > skb2->tail) {
1677                                 if (net_ratelimit())
1678                                         printk(KERN_CRIT "protocol %04x is "
1679                                                "buggy, dev %s\n",
1680                                                ntohs(skb2->protocol),
1681                                                dev->name);
1682                                 skb_reset_network_header(skb2);
1683                         }
1684
1685                         skb2->transport_header = skb2->network_header;
1686                         skb2->pkt_type = PACKET_OUTGOING;
1687                         pt_prev = ptype;
1688                 }
1689         }
1690         if (pt_prev)
1691                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1692         rcu_read_unlock();
1693 }
1694
1695 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1696  * @dev: Network device
1697  * @txq: number of queues available
1698  *
1699  * If real_num_tx_queues is changed the tc mappings may no longer be
1700  * valid. To resolve this verify the tc mapping remains valid and if
1701  * not NULL the mapping. With no priorities mapping to this
1702  * offset/count pair it will no longer be used. In the worst case TC0
1703  * is invalid nothing can be done so disable priority mappings. If is
1704  * expected that drivers will fix this mapping if they can before
1705  * calling netif_set_real_num_tx_queues.
1706  */
1707 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1708 {
1709         int i;
1710         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1711
1712         /* If TC0 is invalidated disable TC mapping */
1713         if (tc->offset + tc->count > txq) {
1714                 pr_warning("Number of in use tx queues changed "
1715                            "invalidating tc mappings. Priority "
1716                            "traffic classification disabled!\n");
1717                 dev->num_tc = 0;
1718                 return;
1719         }
1720
1721         /* Invalidated prio to tc mappings set to TC0 */
1722         for (i = 1; i < TC_BITMASK + 1; i++) {
1723                 int q = netdev_get_prio_tc_map(dev, i);
1724
1725                 tc = &dev->tc_to_txq[q];
1726                 if (tc->offset + tc->count > txq) {
1727                         pr_warning("Number of in use tx queues "
1728                                    "changed. Priority %i to tc "
1729                                    "mapping %i is no longer valid "
1730                                    "setting map to 0\n",
1731                                    i, q);
1732                         netdev_set_prio_tc_map(dev, i, 0);
1733                 }
1734         }
1735 }
1736
1737 /*
1738  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1739  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1740  */
1741 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1742 {
1743         int rc;
1744
1745         if (txq < 1 || txq > dev->num_tx_queues)
1746                 return -EINVAL;
1747
1748         if (dev->reg_state == NETREG_REGISTERED ||
1749             dev->reg_state == NETREG_UNREGISTERING) {
1750                 ASSERT_RTNL();
1751
1752                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1753                                                   txq);
1754                 if (rc)
1755                         return rc;
1756
1757                 if (dev->num_tc)
1758                         netif_setup_tc(dev, txq);
1759
1760                 if (txq < dev->real_num_tx_queues)
1761                         qdisc_reset_all_tx_gt(dev, txq);
1762         }
1763
1764         dev->real_num_tx_queues = txq;
1765         return 0;
1766 }
1767 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1768
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code (-EINVAL when @rxq is zero or larger than
 *	dev->num_rx_queues).  If called before registration with a valid
 *	@rxq, it always succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (!rxq || rxq > dev->num_rx_queues)
		return -EINVAL;

	/* Only a registered device has RX queue kobjects to resize. */
	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1801
1802 static inline void __netif_reschedule(struct Qdisc *q)
1803 {
1804         struct softnet_data *sd;
1805         unsigned long flags;
1806
1807         local_irq_save(flags);
1808         sd = &__get_cpu_var(softnet_data);
1809         q->next_sched = NULL;
1810         *sd->output_queue_tailp = q;
1811         sd->output_queue_tailp = &q->next_sched;
1812         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1813         local_irq_restore(flags);
1814 }
1815
1816 void __netif_schedule(struct Qdisc *q)
1817 {
1818         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1819                 __netif_reschedule(q);
1820 }
1821 EXPORT_SYMBOL(__netif_schedule);
1822
1823 void dev_kfree_skb_irq(struct sk_buff *skb)
1824 {
1825         if (atomic_dec_and_test(&skb->users)) {
1826                 struct softnet_data *sd;
1827                 unsigned long flags;
1828
1829                 local_irq_save(flags);
1830                 sd = &__get_cpu_var(softnet_data);
1831                 skb->next = sd->completion_queue;
1832                 sd->completion_queue = skb;
1833                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1834                 local_irq_restore(flags);
1835         }
1836 }
1837 EXPORT_SYMBOL(dev_kfree_skb_irq);
1838
/*
 * Release @skb from any context: use dev_kfree_skb_irq() when running in
 * hard-irq context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1847
1848
1849 /**
1850  * netif_device_detach - mark device as removed
1851  * @dev: network device
1852  *
1853  * Mark device as removed from system and therefore no longer available.
1854  */
1855 void netif_device_detach(struct net_device *dev)
1856 {
1857         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1858             netif_running(dev)) {
1859                 netif_tx_stop_all_queues(dev);
1860         }
1861 }
1862 EXPORT_SYMBOL(netif_device_detach);
1863
1864 /**
1865  * netif_device_attach - mark device as attached
1866  * @dev: network device
1867  *
1868  * Mark device as attached from system and restart if needed.
1869  */
1870 void netif_device_attach(struct net_device *dev)
1871 {
1872         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1873             netif_running(dev)) {
1874                 netif_tx_wake_all_queues(dev);
1875                 __netdev_watchdog_up(dev);
1876         }
1877 }
1878 EXPORT_SYMBOL(netif_device_attach);
1879
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.  The dst reference is
 * dropped unconditionally.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_netns;

	skb_dst_drop(skb);

	crosses_netns = skb->dev &&
			!net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_netns) {
		/* Scrub state that only makes sense in the old namespace. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1909
1910 static void skb_warn_bad_offload(const struct sk_buff *skb)
1911 {
1912         static const netdev_features_t null_features = 0;
1913         struct net_device *dev = skb->dev;
1914         const char *driver = "";
1915
1916         if (dev && dev->dev.parent)
1917                 driver = dev_driver_string(dev->dev.parent);
1918
1919         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1920              "gso_type=%d ip_summed=%d\n",
1921              driver, dev ? &dev->features : &null_features,
1922              skb->sk ? &skb->sk->sk_route_caps : &null_features,
1923              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1924              skb_shinfo(skb)->gso_type, skb->ip_summed);
1925 }
1926
1927 /*
1928  * Invalidate hardware checksum when packet is to be mangled, and
1929  * complete checksum manually on outgoing path.
1930  */
1931 int skb_checksum_help(struct sk_buff *skb)
1932 {
1933         __wsum csum;
1934         int ret = 0, offset;
1935
1936         if (skb->ip_summed == CHECKSUM_COMPLETE)
1937                 goto out_set_summed;
1938
1939         if (unlikely(skb_shinfo(skb)->gso_size)) {
1940                 skb_warn_bad_offload(skb);
1941                 return -EINVAL;
1942         }
1943
1944         offset = skb_checksum_start_offset(skb);
1945         BUG_ON(offset >= skb_headlen(skb));
1946         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1947
1948         offset += skb->csum_offset;
1949         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1950
1951         if (skb_cloned(skb) &&
1952             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1953                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1954                 if (ret)
1955                         goto out;
1956         }
1957
1958         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1959 out_set_summed:
1960         skb->ip_summed = CHECKSUM_NONE;
1961 out:
1962         return ret;
1963 }
1964 EXPORT_SYMBOL(skb_checksum_help);
1965
1966 /**
1967  *      skb_gso_segment - Perform segmentation on skb.
1968  *      @skb: buffer to segment
1969  *      @features: features for the output path (see dev->features)
1970  *
1971  *      This function segments the given skb and returns a list of segments.
1972  *
1973  *      It may return NULL if the skb requires no segmentation.  This is
1974  *      only possible when GSO is used for verifying header integrity.
1975  */
1976 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1977         netdev_features_t features)
1978 {
1979         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1980         struct packet_type *ptype;
1981         __be16 type = skb->protocol;
1982         int vlan_depth = ETH_HLEN;
1983         int err;
1984
1985         while (type == htons(ETH_P_8021Q)) {
1986                 struct vlan_hdr *vh;
1987
1988                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1989                         return ERR_PTR(-EINVAL);
1990
1991                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1992                 type = vh->h_vlan_encapsulated_proto;
1993                 vlan_depth += VLAN_HLEN;
1994         }
1995
1996         skb_reset_mac_header(skb);
1997         skb->mac_len = skb->network_header - skb->mac_header;
1998         __skb_pull(skb, skb->mac_len);
1999
2000         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2001                 skb_warn_bad_offload(skb);
2002
2003                 if (skb_header_cloned(skb) &&
2004                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2005                         return ERR_PTR(err);
2006         }
2007
2008         rcu_read_lock();
2009         list_for_each_entry_rcu(ptype,
2010                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2011                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2012                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2013                                 err = ptype->gso_send_check(skb);
2014                                 segs = ERR_PTR(err);
2015                                 if (err || skb_gso_ok(skb, features))
2016                                         break;
2017                                 __skb_push(skb, (skb->data -
2018                                                  skb_network_header(skb)));
2019                         }
2020                         segs = ptype->gso_segment(skb, features);
2021                         break;
2022                 }
2023         }
2024         rcu_read_unlock();
2025
2026         __skb_push(skb, skb->data - skb_mac_header(skb));
2027
2028         return segs;
2029 }
2030 EXPORT_SYMBOL(skb_gso_segment);
2031
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, subject to net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2044
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb carries a page fragment the device cannot reach:
 * a highmem page while the device lacks NETIF_F_HIGHDMA, or (when
 * PCI_DMA_BUS_IS_PHYS) a page ending beyond the parent device's DMA
 * mask or a parent with no DMA mask at all.  Returns 0 otherwise, and
 * always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int n;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			skb_frag_t *f = &skb_shinfo(skb)->frags[n];

			if (PageHighMem(skb_frag_page(f)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
		skb_frag_t *f = &skb_shinfo(skb)->frags[n];
		dma_addr_t addr = page_to_phys(skb_frag_page(f));

		/* The last byte of the page must fit under the mask. */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2077
2078 struct dev_gso_cb {
2079         void (*destructor)(struct sk_buff *skb);
2080 };
2081
2082 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2083
2084 static void dev_gso_skb_destructor(struct sk_buff *skb)
2085 {
2086         struct dev_gso_cb *cb;
2087
2088         do {
2089                 struct sk_buff *nskb = skb->next;
2090
2091                 skb->next = nskb->next;
2092                 nskb->next = NULL;
2093                 kfree_skb(nskb);
2094         } while (skb->next);
2095
2096         cb = DEV_GSO_CB(skb);
2097         if (cb->destructor)
2098                 cb->destructor(skb);
2099 }
2100
2101 /**
2102  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2103  *      @skb: buffer to segment
2104  *      @features: device features as applicable to this skb
2105  *
2106  *      This function segments the given skb and stores the list of segments
2107  *      in skb->next.
2108  */
2109 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2110 {
2111         struct sk_buff *segs;
2112
2113         segs = skb_gso_segment(skb, features);
2114
2115         /* Verifying header integrity only. */
2116         if (!segs)
2117                 return 0;
2118
2119         if (IS_ERR(segs))
2120                 return PTR_ERR(segs);
2121
2122         skb->next = segs;
2123         DEV_GSO_CB(skb)->destructor = skb->destructor;
2124         skb->destructor = dev_gso_skb_destructor;
2125
2126         return 0;
2127 }
2128
2129 /*
2130  * Try to orphan skb early, right before transmission by the device.
2131  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2132  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2133  */
2134 static inline void skb_orphan_try(struct sk_buff *skb)
2135 {
2136         struct sock *sk = skb->sk;
2137
2138         if (sk && !skb_shinfo(skb)->tx_flags) {
2139                 /* skb_tx_hash() wont be able to get sk.
2140                  * We copy sk_hash into skb->rxhash
2141                  */
2142                 if (!skb->rxhash)
2143                         skb->rxhash = sk->sk_hash;
2144                 skb_orphan(skb);
2145         }
2146 }
2147
2148 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2149 {
2150         return ((features & NETIF_F_GEN_CSUM) ||
2151                 ((features & NETIF_F_V4_CSUM) &&
2152                  protocol == htons(ETH_P_IP)) ||
2153                 ((features & NETIF_F_V6_CSUM) &&
2154                  protocol == htons(ETH_P_IPV6)) ||
2155                 ((features & NETIF_F_FCOE_CRC) &&
2156                  protocol == htons(ETH_P_FCOE)));
2157 }
2158
2159 static netdev_features_t harmonize_features(struct sk_buff *skb,
2160         __be16 protocol, netdev_features_t features)
2161 {
2162         if (!can_checksum_protocol(features, protocol)) {
2163                 features &= ~NETIF_F_ALL_CSUM;
2164                 features &= ~NETIF_F_SG;
2165         } else if (illegal_highdma(skb->dev, skb)) {
2166                 features &= ~NETIF_F_SG;
2167         }
2168
2169         return features;
2170 }
2171
2172 netdev_features_t netif_skb_features(struct sk_buff *skb)
2173 {
2174         __be16 protocol = skb->protocol;
2175         netdev_features_t features = skb->dev->features;
2176
2177         if (protocol == htons(ETH_P_8021Q)) {
2178                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2179                 protocol = veh->h_vlan_encapsulated_proto;
2180         } else if (!vlan_tx_tag_present(skb)) {
2181                 return harmonize_features(skb, protocol, features);
2182         }
2183
2184         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2185
2186         if (protocol != htons(ETH_P_8021Q)) {
2187                 return harmonize_features(skb, protocol, features);
2188         } else {
2189                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2190                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2191                 return harmonize_features(skb, protocol, features);
2192         }
2193 }
2194 EXPORT_SYMBOL(netif_skb_features);
2195
2196 /*
2197  * Returns true if either:
2198  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2199  *      2. skb is fragmented and the device does not support SG, or if
2200  *         at least one of fragments is in highmem and device does not
2201  *         support DMA from it.
2202  */
2203 static inline int skb_needs_linearize(struct sk_buff *skb,
2204                                       int features)
2205 {
2206         return skb_is_nonlinear(skb) &&
2207                         ((skb_has_frag_list(skb) &&
2208                                 !(features & NETIF_F_FRAGLIST)) ||
2209                         (skb_shinfo(skb)->nr_frags &&
2210                                 !(features & NETIF_F_SG)));
2211 }
2212
2213 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2214                         struct netdev_queue *txq)
2215 {
2216         const struct net_device_ops *ops = dev->netdev_ops;
2217         int rc = NETDEV_TX_OK;
2218         unsigned int skb_len;
2219
2220         if (likely(!skb->next)) {
2221                 netdev_features_t features;
2222
2223                 /*
2224                  * If device doesn't need skb->dst, release it right now while
2225                  * its hot in this cpu cache
2226                  */
2227                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2228                         skb_dst_drop(skb);
2229
2230                 if (!list_empty(&ptype_all))
2231                         dev_queue_xmit_nit(skb, dev);
2232
2233                 skb_orphan_try(skb);
2234
2235                 features = netif_skb_features(skb);
2236
2237                 if (vlan_tx_tag_present(skb) &&
2238                     !(features & NETIF_F_HW_VLAN_TX)) {
2239                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2240                         if (unlikely(!skb))
2241                                 goto out;
2242
2243                         skb->vlan_tci = 0;
2244                 }
2245
2246                 if (netif_needs_gso(skb, features)) {
2247                         if (unlikely(dev_gso_segment(skb, features)))
2248                                 goto out_kfree_skb;
2249                         if (skb->next)
2250                                 goto gso;
2251                 } else {
2252                         if (skb_needs_linearize(skb, features) &&
2253                             __skb_linearize(skb))
2254                                 goto out_kfree_skb;
2255
2256                         /* If packet is not checksummed and device does not
2257                          * support checksumming for this protocol, complete
2258                          * checksumming here.
2259                          */
2260                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2261                                 skb_set_transport_header(skb,
2262                                         skb_checksum_start_offset(skb));
2263                                 if (!(features & NETIF_F_ALL_CSUM) &&
2264                                      skb_checksum_help(skb))
2265                                         goto out_kfree_skb;
2266                         }
2267                 }
2268
2269                 skb_len = skb->len;
2270                 rc = ops->ndo_start_xmit(skb, dev);
2271                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2272                 if (rc == NETDEV_TX_OK)
2273                         txq_trans_update(txq);
2274                 return rc;
2275         }
2276
2277 gso:
2278         do {
2279                 struct sk_buff *nskb = skb->next;
2280
2281                 skb->next = nskb->next;
2282                 nskb->next = NULL;
2283
2284                 /*
2285                  * If device doesn't need nskb->dst, release it right now while
2286                  * its hot in this cpu cache
2287                  */
2288                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2289                         skb_dst_drop(nskb);
2290
2291                 skb_len = nskb->len;
2292                 rc = ops->ndo_start_xmit(nskb, dev);
2293                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2294                 if (unlikely(rc != NETDEV_TX_OK)) {
2295                         if (rc & ~NETDEV_TX_MASK)
2296                                 goto out_kfree_gso_skb;
2297                         nskb->next = skb->next;
2298                         skb->next = nskb;
2299                         return rc;
2300                 }
2301                 txq_trans_update(txq);
2302                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2303                         return NETDEV_TX_BUSY;
2304         } while (skb->next);
2305
2306 out_kfree_gso_skb:
2307         if (likely(skb->next == NULL))
2308                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2309 out_kfree_skb:
2310         kfree_skb(skb);
2311 out:
2312         return rc;
2313 }
2314
2315 static u32 hashrnd __read_mostly;
2316
2317 /*
2318  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2319  * to be used as a distribution range.
2320  */
2321 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2322                   unsigned int num_tx_queues)
2323 {
2324         u32 hash;
2325         u16 qoffset = 0;
2326         u16 qcount = num_tx_queues;
2327
2328         if (skb_rx_queue_recorded(skb)) {
2329                 hash = skb_get_rx_queue(skb);
2330                 while (unlikely(hash >= num_tx_queues))
2331                         hash -= num_tx_queues;
2332                 return hash;
2333         }
2334
2335         if (dev->num_tc) {
2336                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2337                 qoffset = dev->tc_to_txq[tc].offset;
2338                 qcount = dev->tc_to_txq[tc].count;
2339         }
2340
2341         if (skb->sk && skb->sk->sk_hash)
2342                 hash = skb->sk->sk_hash;
2343         else
2344                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2345         hash = jhash_1word(hash, hashrnd);
2346
2347         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2348 }
2349 EXPORT_SYMBOL(__skb_tx_hash);
2350
2351 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2352 {
2353         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2354                 if (net_ratelimit()) {
2355                         pr_warning("%s selects TX queue %d, but "
2356                                 "real number of TX queues is %d\n",
2357                                 dev->name, queue_index, dev->real_num_tx_queues);
2358                 }
2359                 return 0;
2360         }
2361         return queue_index;
2362 }
2363
/*
 * Pick a TX queue from the device's XPS map for the current CPU.
 *
 * Returns the selected queue index, or -1 when CONFIG_XPS is off, the
 * device or this CPU has no map, or the selected queue is not below
 * real_num_tx_queues.  With several queues in the map, one is chosen by
 * hashing the socket hash (or protocol ^ rxhash) with hashrnd.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int qidx = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		qidx = cpu_map->queues[0];
	} else {
		u32 key;

		if (skb->sk && skb->sk->sk_hash)
			key = skb->sk->sk_hash;
		else
			key = (__force u16) skb->protocol ^
			    skb->rxhash;
		key = jhash_1word(key, hashrnd);
		qidx = cpu_map->queues[
		    ((u64)key * cpu_map->len) >> 32];
	}

	/* Reject a mapped queue that is not currently in use. */
	if (unlikely(qidx >= dev->real_num_tx_queues))
		qidx = -1;

unlock:
	rcu_read_unlock();

	return qidx;
#else
	return -1;
#endif
}
2401
2402 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2403                                         struct sk_buff *skb)
2404 {
2405         int queue_index;
2406         const struct net_device_ops *ops = dev->netdev_ops;
2407
2408         if (dev->real_num_tx_queues == 1)
2409                 queue_index = 0;
2410         else if (ops->ndo_select_queue) {
2411                 queue_index = ops->ndo_select_queue(dev, skb);
2412                 queue_index = dev_cap_txqueue(dev, queue_index);
2413         } else {
2414                 struct sock *sk = skb->sk;
2415                 queue_index = sk_tx_queue_get(sk);
2416
2417                 if (queue_index < 0 || skb->ooo_okay ||
2418                     queue_index >= dev->real_num_tx_queues) {
2419                         int old_index = queue_index;
2420
2421                         queue_index = get_xps_queue(dev, skb);
2422                         if (queue_index < 0)
2423                                 queue_index = skb_tx_hash(dev, skb);
2424
2425                         if (queue_index != old_index && sk) {
2426                                 struct dst_entry *dst =
2427                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2428
2429                                 if (dst && skb_dst(skb) == dst)
2430                                         sk_tx_queue_set(sk, queue_index);
2431                         }
2432                 }
2433         }
2434
2435         skb_set_queue_mapping(skb, queue_index);
2436         return netdev_get_tx_queue(dev, queue_index);
2437 }
2438
2439 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2440                                  struct net_device *dev,
2441                                  struct netdev_queue *txq)
2442 {
2443         spinlock_t *root_lock = qdisc_lock(q);
2444         bool contended;
2445         int rc;
2446
2447         qdisc_skb_cb(skb)->pkt_len = skb->len;
2448         qdisc_calculate_pkt_len(skb, q);
2449         /*
2450          * Heuristic to force contended enqueues to serialize on a
2451          * separate lock before trying to get qdisc main lock.
2452          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2453          * and dequeue packets faster.
2454          */
2455         contended = qdisc_is_running(q);
2456         if (unlikely(contended))
2457                 spin_lock(&q->busylock);
2458
2459         spin_lock(root_lock);
2460         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2461                 kfree_skb(skb);
2462                 rc = NET_XMIT_DROP;
2463         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2464                    qdisc_run_begin(q)) {
2465                 /*
2466                  * This is a work-conserving queue; there are no old skbs
2467                  * waiting to be sent out; and the qdisc is not running -
2468                  * xmit the skb directly.
2469                  */
2470                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2471                         skb_dst_force(skb);
2472
2473                 qdisc_bstats_update(q, skb);
2474
2475                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2476                         if (unlikely(contended)) {
2477                                 spin_unlock(&q->busylock);
2478                                 contended = false;
2479                         }
2480                         __qdisc_run(q);
2481                 } else
2482                         qdisc_run_end(q);
2483
2484                 rc = NET_XMIT_SUCCESS;
2485         } else {
2486                 skb_dst_force(skb);
2487                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2488                 if (qdisc_run_begin(q)) {
2489                         if (unlikely(contended)) {
2490                                 spin_unlock(&q->busylock);
2491                                 contended = false;
2492                         }
2493                         __qdisc_run(q);
2494                 }
2495         }
2496         spin_unlock(root_lock);
2497         if (unlikely(contended))
2498                 spin_unlock(&q->busylock);
2499         return rc;
2500 }
2501
2502 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2503 static void skb_update_prio(struct sk_buff *skb)
2504 {
2505         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2506
2507         if ((!skb->priority) && (skb->sk) && map)
2508                 skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2509 }
2510 #else
2511 #define skb_update_prio(skb)
2512 #endif
2513
2514 static DEFINE_PER_CPU(int, xmit_recursion);
2515 #define RECURSION_LIMIT 10
2516
2517 /**
2518  *      dev_queue_xmit - transmit a buffer
2519  *      @skb: buffer to transmit
2520  *
2521  *      Queue a buffer for transmission to a network device. The caller must
2522  *      have set the device and priority and built the buffer before calling
2523  *      this function. The function can be called from an interrupt.
2524  *
2525  *      A negative errno code is returned on a failure. A success does not
2526  *      guarantee the frame will be transmitted as it may be dropped due
2527  *      to congestion or traffic shaping.
2528  *
2529  * -----------------------------------------------------------------------------------
2530  *      I notice this method can also return errors from the queue disciplines,
2531  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2532  *      be positive.
2533  *
2534  *      Regardless of the return value, the skb is consumed, so it is currently
2535  *      difficult to retry a send to this method.  (You can bump the ref count
2536  *      before sending to hold a reference for retry if you are careful.)
2537  *
2538  *      When calling this method, interrupts MUST be enabled.  This is because
2539  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2540  *          --BLG
2541  */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        /* Select the TX queue, then the qdisc currently attached to it. */
        txq = dev_pick_tx(dev, skb);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue method owns the skb from here on. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           It is unlikely that netif_tx_lock protection is really
           necessary here (e.g. loopback and IP tunnels are clean,
           ignoring statistics counters).
           However, such devices may rely on the protection we
           provide here.

           So take the lock anyway; it cannot deadlock because we check
           the lock owner and the recursion depth first.
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* This CPU already owning the TX lock means we re-entered. */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                __this_cpu_inc(xmit_recursion);
                                rc = dev_hard_start_xmit(skb, dev, txq);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        if (net_ratelimit())
                                printk(KERN_CRIT "Virtual device %s asks to "
                                       "queue packet!\n", dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        if (net_ratelimit())
                                printk(KERN_CRIT "Dead loop on virtual device "
                                       "%s, fix it urgently!\n", dev->name);
                }
        }

        /* Failure path: device down, queue stopped, transmit not completed,
         * or recursion detected.  The skb is freed here and -ENETDOWN is
         * returned in every one of those cases.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
2623 EXPORT_SYMBOL(dev_queue_xmit);
2624
2625
2626 /*=======================================================================
2627                         Receiver routines
2628   =======================================================================*/
2629
/* Length limit for a per-CPU input_pkt_queue: enqueue_to_backlog() drops
 * the packet once the queue is longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
/* Passed to net_timestamp_check() before queueing in netif_rx() and
 * netif_receive_skb(), and negated in __netif_receive_skb().
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * per-softirq packet budget -- confirm against the NET_RX handler.
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2634
2635 /* Called with irq disabled */
2636 static inline void ____napi_schedule(struct softnet_data *sd,
2637                                      struct napi_struct *napi)
2638 {
2639         list_add_tail(&napi->poll_list, &sd->poll_list);
2640         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2641 }
2642
2643 /*
2644  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2645  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2646  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2647  * if hash is a canonical 4-tuple hash over transport ports.
2648  */
2649 void __skb_get_rxhash(struct sk_buff *skb)
2650 {
2651         struct flow_keys keys;
2652         u32 hash;
2653
2654         if (!skb_flow_dissect(skb, &keys))
2655                 return;
2656
2657         if (keys.ports) {
2658                 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2659                         swap(keys.port16[0], keys.port16[1]);
2660                 skb->l4_rxhash = 1;
2661         }
2662
2663         /* get a consistent hash (same value on both flow directions) */
2664         if ((__force u32)keys.dst < (__force u32)keys.src)
2665                 swap(keys.dst, keys.src);
2666
2667         hash = jhash_3words((__force u32)keys.dst,
2668                             (__force u32)keys.src,
2669                             (__force u32)keys.ports, hashrnd);
2670         if (!hash)
2671                 hash = 1;
2672
2673         skb->rxhash = hash;
2674 }
2675 EXPORT_SYMBOL(__skb_get_rxhash);
2676
2677 #ifdef CONFIG_RPS
2678
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Tested with static_branch() in netif_rx() and netif_receive_skb():
 * get_rps_cpu() is only consulted when this key is enabled.
 */
struct jump_label_key rps_needed __read_mostly;
2684
/*
 * set_rps_cpu - re-target a device flow entry to @next_cpu
 * @dev: receiving device
 * @skb: packet that triggered the change
 * @rflow: flow entry currently associated with the packet
 * @next_cpu: CPU the flow should be steered to, or RPS_NO_CPU
 *
 * Records @next_cpu in the flow entry and, when it is a real CPU, stores
 * that CPU's current input_queue_head in last_qtail.  With
 * CONFIG_RFS_ACCEL the driver may additionally be asked to steer the flow
 * to the RX queue mapped to @next_cpu; if it accepts, the entry in that
 * queue's flow table is used instead of @rflow.
 *
 * Returns the flow entry that was updated.  The flow table is read with
 * rcu_dereference(), so the caller must hold rcu_read_lock().
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u32 flow_id;
                u16 rxq_index;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the queue mapped to next_cpu. */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = skb->rxhash & flow_table->mask;
                /* A negative return means the driver did not install a
                 * filter; otherwise rc is kept as the filter id.
                 */
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                rflow->filter = rc;
                /* The previous entry no longer owns this filter id.
                 * NOTE(review): if old_rflow and rflow are the same entry
                 * this clears the id just stored -- confirm intended.
                 */
                if (old_rflow->filter == rflow->filter)
                        old_rflow->filter = RPS_NO_FILTER;
        out:
#endif
                rflow->last_qtail =
                        per_cpu(softnet_data, next_cpu).input_queue_head;
        }

        rflow->cpu = next_cpu;
        return rflow;
}
2729
2730 /*
2731  * get_rps_cpu is called from netif_receive_skb and returns the target
2732  * CPU from the RPS map of the receiving queue for a given skb.
2733  * rcu_read_lock must be held on entry.
2734  */
2735 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2736                        struct rps_dev_flow **rflowp)
2737 {
2738         struct netdev_rx_queue *rxqueue;
2739         struct rps_map *map;
2740         struct rps_dev_flow_table *flow_table;
2741         struct rps_sock_flow_table *sock_flow_table;
2742         int cpu = -1;
2743         u16 tcpu;
2744
2745         if (skb_rx_queue_recorded(skb)) {
2746                 u16 index = skb_get_rx_queue(skb);
2747                 if (unlikely(index >= dev->real_num_rx_queues)) {
2748                         WARN_ONCE(dev->real_num_rx_queues > 1,
2749                                   "%s received packet on queue %u, but number "
2750                                   "of RX queues is %u\n",
2751                                   dev->name, index, dev->real_num_rx_queues);
2752                         goto done;
2753                 }
2754                 rxqueue = dev->_rx + index;
2755         } else
2756                 rxqueue = dev->_rx;
2757
2758         map = rcu_dereference(rxqueue->rps_map);
2759         if (map) {
2760                 if (map->len == 1 &&
2761                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2762                         tcpu = map->cpus[0];
2763                         if (cpu_online(tcpu))
2764                                 cpu = tcpu;
2765                         goto done;
2766                 }
2767         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2768                 goto done;
2769         }
2770
2771         skb_reset_network_header(skb);
2772         if (!skb_get_rxhash(skb))
2773                 goto done;
2774
2775         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2776         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2777         if (flow_table && sock_flow_table) {
2778                 u16 next_cpu;
2779                 struct rps_dev_flow *rflow;
2780
2781                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2782                 tcpu = rflow->cpu;
2783
2784                 next_cpu = sock_flow_table->ents[skb->rxhash &
2785                     sock_flow_table->mask];
2786
2787                 /*
2788                  * If the desired CPU (where last recvmsg was done) is
2789                  * different from current CPU (one in the rx-queue flow
2790                  * table entry), switch if one of the following holds:
2791                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2792                  *   - Current CPU is offline.
2793                  *   - The current CPU's queue tail has advanced beyond the
2794                  *     last packet that was enqueued using this table entry.
2795                  *     This guarantees that all previous packets for the flow
2796                  *     have been dequeued, thus preserving in order delivery.
2797                  */
2798                 if (unlikely(tcpu != next_cpu) &&
2799                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2800                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2801                       rflow->last_qtail)) >= 0))
2802                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2803
2804                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2805                         *rflowp = rflow;
2806                         cpu = tcpu;
2807                         goto done;
2808                 }
2809         }
2810
2811         if (map) {
2812                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2813
2814                 if (cpu_online(tcpu)) {
2815                         cpu = tcpu;
2816                         goto done;
2817                 }
2818         }
2819
2820 done:
2821         return cpu;
2822 }
2823
2824 #ifdef CONFIG_RFS_ACCEL
2825
2826 /**
2827  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2828  * @dev: Device on which the filter was set
2829  * @rxq_index: RX queue index
2830  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2831  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2832  *
2833  * Drivers that implement ndo_rx_flow_steer() should periodically call
2834  * this function for each installed filter and remove the filters for
2835  * which it returns %true.
2836  */
2837 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2838                          u32 flow_id, u16 filter_id)
2839 {
2840         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2841         struct rps_dev_flow_table *flow_table;
2842         struct rps_dev_flow *rflow;
2843         bool expire = true;
2844         int cpu;
2845
2846         rcu_read_lock();
2847         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2848         if (flow_table && flow_id <= flow_table->mask) {
2849                 rflow = &flow_table->flows[flow_id];
2850                 cpu = ACCESS_ONCE(rflow->cpu);
2851                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2852                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2853                            rflow->last_qtail) <
2854                      (int)(10 * flow_table->mask)))
2855                         expire = false;
2856         }
2857         rcu_read_unlock();
2858         return expire;
2859 }
2860 EXPORT_SYMBOL(rps_may_expire_flow);
2861
2862 #endif /* CONFIG_RFS_ACCEL */
2863
2864 /* Called from hardirq (IPI) context */
2865 static void rps_trigger_softirq(void *data)
2866 {
2867         struct softnet_data *sd = data;
2868
2869         ____napi_schedule(sd, &sd->backlog);
2870         sd->received_rps++;
2871 }
2872
2873 #endif /* CONFIG_RPS */
2874
/*
 * Decide whether @sd belongs to another CPU.
 *
 * If it does (only possible with CONFIG_RPS), push it onto the head of
 * this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ and return 1.
 * Otherwise return 0 and leave everything untouched.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
        int queued = 0;
#ifdef CONFIG_RPS
        struct softnet_data *this_sd = &__get_cpu_var(softnet_data);

        if (sd != this_sd) {
                sd->rps_ipi_next = this_sd->rps_ipi_list;
                this_sd->rps_ipi_list = sd;

                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                queued = 1;
        }
#endif /* CONFIG_RPS */
        return queued;
}
2895
2896 /*
2897  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2898  * queue (may be a remote CPU queue).
2899  */
2900 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2901                               unsigned int *qtail)
2902 {
2903         struct softnet_data *sd;
2904         unsigned long flags;
2905
2906         sd = &per_cpu(softnet_data, cpu);
2907
2908         local_irq_save(flags);
2909
2910         rps_lock(sd);
2911         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2912                 if (skb_queue_len(&sd->input_pkt_queue)) {
2913 enqueue:
2914                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2915                         input_queue_tail_incr_save(sd, qtail);
2916                         rps_unlock(sd);
2917                         local_irq_restore(flags);
2918                         return NET_RX_SUCCESS;
2919                 }
2920
2921                 /* Schedule NAPI for backlog device
2922                  * We can use non atomic operation since we own the queue lock
2923                  */
2924                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2925                         if (!rps_ipi_queued(sd))
2926                                 ____napi_schedule(sd, &sd->backlog);
2927                 }
2928                 goto enqueue;
2929         }
2930
2931         sd->dropped++;
2932         rps_unlock(sd);
2933
2934         local_irq_restore(flags);
2935
2936         atomic_long_inc(&skb->dev->rx_dropped);
2937         kfree_skb(skb);
2938         return NET_RX_DROP;
2939 }
2940
2941 /**
2942  *      netif_rx        -       post buffer to the network code
2943  *      @skb: buffer to post
2944  *
2945  *      This function receives a packet from a device driver and queues it for
2946  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2947  *      may be dropped during processing for congestion control or by the
2948  *      protocol layers.
2949  *
2950  *      return values:
2951  *      NET_RX_SUCCESS  (no congestion)
2952  *      NET_RX_DROP     (packet was dropped)
2953  *
2954  */
2955
2956 int netif_rx(struct sk_buff *skb)
2957 {
2958         int ret;
2959
2960         /* if netpoll wants it, pretend we never saw it */
2961         if (netpoll_rx(skb))
2962                 return NET_RX_DROP;
2963
2964         net_timestamp_check(netdev_tstamp_prequeue, skb);
2965
2966         trace_netif_rx(skb);
2967 #ifdef CONFIG_RPS
2968         if (static_branch(&rps_needed)) {
2969                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2970                 int cpu;
2971
2972                 preempt_disable();
2973                 rcu_read_lock();
2974
2975                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2976                 if (cpu < 0)
2977                         cpu = smp_processor_id();
2978
2979                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2980
2981                 rcu_read_unlock();
2982                 preempt_enable();
2983         } else
2984 #endif
2985         {
2986                 unsigned int qtail;
2987                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2988                 put_cpu();
2989         }
2990         return ret;
2991 }
2992 EXPORT_SYMBOL(netif_rx);
2993
/*
 * netif_rx_ni - netif_rx() followed by an immediate softirq run
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and, if any softirq is then
 * pending on this CPU, runs do_softirq() before re-enabling preemption.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int status;

        preempt_disable();

        status = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();
        return status;
}
3006 EXPORT_SYMBOL(netif_rx_ni);
3007
3008 static void net_tx_action(struct softirq_action *h)
3009 {
3010         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3011
3012         if (sd->completion_queue) {
3013                 struct sk_buff *clist;
3014
3015                 local_irq_disable();
3016                 clist = sd->completion_queue;
3017                 sd->completion_queue = NULL;
3018                 local_irq_enable();
3019
3020                 while (clist) {
3021                         struct sk_buff *skb = clist;
3022                         clist = clist->next;
3023
3024                         WARN_ON(atomic_read(&skb->users));
3025                         trace_kfree_skb(skb, net_tx_action);
3026                         __kfree_skb(skb);
3027                 }
3028         }
3029
3030         if (sd->output_queue) {
3031                 struct Qdisc *head;
3032
3033                 local_irq_disable();
3034                 head = sd->output_queue;
3035                 sd->output_queue = NULL;
3036                 sd->output_queue_tailp = &sd->output_queue;
3037                 local_irq_enable();
3038
3039                 while (head) {
3040                         struct Qdisc *q = head;
3041                         spinlock_t *root_lock;
3042
3043                         head = head->next_sched;
3044
3045                         root_lock = qdisc_lock(q);
3046                         if (spin_trylock(root_lock)) {
3047                                 smp_mb__before_clear_bit();
3048                                 clear_bit(__QDISC_STATE_SCHED,
3049                                           &q->state);
3050                                 qdisc_run(q);
3051                                 spin_unlock(root_lock);
3052                         } else {
3053                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3054                                               &q->state)) {
3055                                         __netif_reschedule(q);
3056                                 } else {
3057                                         smp_mb__before_clear_bit();
3058                                         clear_bit(__QDISC_STATE_SCHED,
3059                                                   &q->state);
3060                                 }
3061                         }
3062                 }
3063         }
3064 }
3065
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE.  It only exists when both the
 * bridge and ATM LANE are configured (built in or as modules).  The
 * pointer is not assigned in this part of the file.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3073
3074 #ifdef CONFIG_NET_CLS_ACT
3075 /* TODO: Maybe we should just force sch_ingress to be compiled in
3076  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3077  * a compare and 2 stores extra right now if we dont have it on
3078  * but have CONFIG_NET_CLS_ACT
3079  * NOTE: This doesn't stop any functionality; if you dont have
3080  * the ingress scheduler, you just can't add policies on ingress.
3081  *
3082  */
3083 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3084 {
3085         struct net_device *dev = skb->dev;
3086         u32 ttl = G_TC_RTTL(skb->tc_verd);
3087         int result = TC_ACT_OK;
3088         struct Qdisc *q;
3089
3090         if (unlikely(MAX_RED_LOOP < ttl++)) {
3091                 if (net_ratelimit())
3092                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3093                                skb->skb_iif, dev->ifindex);
3094                 return TC_ACT_SHOT;
3095         }
3096
3097         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3098         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3099
3100         q = rxq->qdisc;
3101         if (q != &noop_qdisc) {
3102                 spin_lock(qdisc_lock(q));
3103                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3104                         result = qdisc_enqueue_root(skb, q);
3105                 spin_unlock(qdisc_lock(q));
3106         }
3107
3108         return result;
3109 }
3110
3111 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3112                                          struct packet_type **pt_prev,
3113                                          int *ret, struct net_device *orig_dev)
3114 {
3115         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3116
3117         if (!rxq || rxq->qdisc == &noop_qdisc)
3118                 goto out;
3119
3120         if (*pt_prev) {
3121                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3122                 *pt_prev = NULL;
3123         }
3124
3125         switch (ing_filter(skb, rxq)) {
3126         case TC_ACT_SHOT:
3127         case TC_ACT_STOLEN:
3128                 kfree_skb(skb);
3129                 return NULL;
3130         }
3131
3132 out:
3133         skb->tc_verd = 0;
3134         return skb;
3135 }
3136 #endif
3137
3138 /**
3139  *      netdev_rx_handler_register - register receive handler
3140  *      @dev: device to register a handler for
3141  *      @rx_handler: receive handler to register
3142  *      @rx_handler_data: data pointer that is used by rx handler
3143  *
3144  *      Register a receive hander for a device. This handler will then be
3145  *      called from __netif_receive_skb. A negative errno code is returned
3146  *      on a failure.
3147  *
3148  *      The caller must hold the rtnl_mutex.
3149  *
3150  *      For a general description of rx_handler, see enum rx_handler_result.
3151  */
3152 int netdev_rx_handler_register(struct net_device *dev,
3153                                rx_handler_func_t *rx_handler,
3154                                void *rx_handler_data)
3155 {
3156         ASSERT_RTNL();
3157
3158         if (dev->rx_handler)
3159                 return -EBUSY;
3160
3161         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3162         rcu_assign_pointer(dev->rx_handler, rx_handler);
3163
3164         return 0;
3165 }
3166 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3167
3168 /**
3169  *      netdev_rx_handler_unregister - unregister receive handler
3170  *      @dev: device to unregister a handler from
3171  *
3172  *      Unregister a receive hander from a device.
3173  *
3174  *      The caller must hold the rtnl_mutex.
3175  */
3176 void netdev_rx_handler_unregister(struct net_device *dev)
3177 {
3178
3179         ASSERT_RTNL();
3180         RCU_INIT_POINTER(dev->rx_handler, NULL);
3181         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3182 }
3183 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3184
/*
 * __netif_receive_skb - deliver a received packet to its handlers
 * @skb: packet to process
 *
 * Runs under rcu_read_lock() and hands the packet, in order, to:
 * the ptype_all taps, the ingress filter (CONFIG_NET_CLS_ACT), VLAN
 * receive processing, the device's rx_handler, and finally the protocol
 * handlers hashed in ptype_base.
 *
 * Delivery is deferred by one step: a matching handler is remembered in
 * pt_prev and only called through deliver_skb() when the next match (or
 * the next stage) is reached.  The last match is called directly via
 * ->func().
 *
 * Returns the result of the last delivery.  Returns NET_RX_DROP when
 * netpoll takes the packet, or when no handler matched; in the latter
 * case the skb is freed and the device's rx_dropped counter is bumped.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *null_or_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!netdev_tstamp_prequeue, skb);

        trace_netif_receive_skb(skb);

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        /* Remember the ingress interface only if not already recorded. */
        if (!skb->skb_iif)
                skb->skb_iif = skb->dev->ifindex;
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        pt_prev = NULL;

        rcu_read_lock();

        /* Re-entered after VLAN processing or an rx_handler asks for
         * another pass over the packet.
         */
another_round:

        __this_cpu_inc(softnet_data.processed);

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
                skb = vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the taps and ingress filter. */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* Taps registered for all protocols, optionally bound to a device. */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* A NULL return means the ingress filter consumed the packet. */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (vlan_tx_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb, !rx_handler))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        /* deliver only exact match when indicated */
        null_or_dev = deliver_exact ? skb->dev : NULL;

        /* Protocol handlers: match on type, then on device binding. */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* The last matching handler is called directly. */
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
3308
3309 /**
3310  *      netif_receive_skb - process receive buffer from network
3311  *      @skb: buffer to process
3312  *
3313  *      netif_receive_skb() is the main receive data processing function.
3314  *      It always succeeds. The buffer may be dropped during processing
3315  *      for congestion control or by the protocol layers.
3316  *
3317  *      This function may only be called from softirq context and interrupts
3318  *      should be enabled.
3319  *
3320  *      Return values (usually ignored):
3321  *      NET_RX_SUCCESS: no congestion
3322  *      NET_RX_DROP: packet was dropped
3323  */
3324 int netif_receive_skb(struct sk_buff *skb)
3325 {
3326         net_timestamp_check(netdev_tstamp_prequeue, skb);
3327
3328         if (skb_defer_rx_timestamp(skb))
3329                 return NET_RX_SUCCESS;
3330
3331 #ifdef CONFIG_RPS
3332         if (static_branch(&rps_needed)) {
3333                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3334                 int cpu, ret;
3335
3336                 rcu_read_lock();
3337
3338                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3339
3340                 if (cpu >= 0) {
3341                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3342                         rcu_read_unlock();
3343                         return ret;
3344                 }
3345                 rcu_read_unlock();
3346         }
3347 #endif
3348         return __netif_receive_skb(skb);
3349 }
3350 EXPORT_SYMBOL(netif_receive_skb);
3351
3352 /* Network device is going away, flush any packets still pending
3353  * Called with irqs disabled.
3354  */
3355 static void flush_backlog(void *arg)
3356 {
3357         struct net_device *dev = arg;
3358         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3359         struct sk_buff *skb, *tmp;
3360
3361         rps_lock(sd);
3362         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3363                 if (skb->dev == dev) {
3364                         __skb_unlink(skb, &sd->input_pkt_queue);
3365                         kfree_skb(skb);
3366                         input_queue_head_incr(sd);
3367                 }
3368         }
3369         rps_unlock(sd);
3370
3371         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3372                 if (skb->dev == dev) {
3373                         __skb_unlink(skb, &sd->process_queue);
3374                         kfree_skb(skb);
3375                         input_queue_head_incr(sd);
3376                 }
3377         }
3378 }
3379
3380 static int napi_gro_complete(struct sk_buff *skb)
3381 {
3382         struct packet_type *ptype;
3383         __be16 type = skb->protocol;
3384         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3385         int err = -ENOENT;
3386
3387         if (NAPI_GRO_CB(skb)->count == 1) {
3388                 skb_shinfo(skb)->gso_size = 0;
3389                 goto out;
3390         }
3391
3392         rcu_read_lock();
3393         list_for_each_entry_rcu(ptype, head, list) {
3394                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3395                         continue;
3396
3397                 err = ptype->gro_complete(skb);
3398                 break;
3399         }
3400         rcu_read_unlock();
3401
3402         if (err) {
3403                 WARN_ON(&ptype->list == head);
3404                 kfree_skb(skb);
3405                 return NET_RX_SUCCESS;
3406         }
3407
3408 out:
3409         return netif_receive_skb(skb);
3410 }
3411
3412 inline void napi_gro_flush(struct napi_struct *napi)
3413 {
3414         struct sk_buff *skb, *next;
3415
3416         for (skb = napi->gro_list; skb; skb = next) {
3417                 next = skb->next;
3418                 skb->next = NULL;
3419                 napi_gro_complete(skb);
3420         }
3421
3422         napi->gro_count = 0;
3423         napi->gro_list = NULL;
3424 }
3425 EXPORT_SYMBOL(napi_gro_flush);
3426
3427 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3428 {
3429         struct sk_buff **pp = NULL;
3430         struct packet_type *ptype;
3431         __be16 type = skb->protocol;
3432         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3433         int same_flow;
3434         int mac_len;
3435         enum gro_result ret;
3436
3437         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3438                 goto normal;
3439
3440         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3441                 goto normal;
3442
3443         rcu_read_lock();
3444         list_for_each_entry_rcu(ptype, head, list) {
3445                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3446                         continue;
3447
3448                 skb_set_network_header(skb, skb_gro_offset(skb));
3449                 mac_len = skb->network_header - skb->mac_header;
3450                 skb->mac_len = mac_len;
3451                 NAPI_GRO_CB(skb)->same_flow = 0;
3452                 NAPI_GRO_CB(skb)->flush = 0;
3453                 NAPI_GRO_CB(skb)->free = 0;
3454
3455                 pp = ptype->gro_receive(&napi->gro_list, skb);
3456                 break;
3457         }
3458         rcu_read_unlock();
3459
3460         if (&ptype->list == head)
3461                 goto normal;
3462
3463         same_flow = NAPI_GRO_CB(skb)->same_flow;
3464         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3465
3466         if (pp) {
3467                 struct sk_buff *nskb = *pp;
3468
3469                 *pp = nskb->next;
3470                 nskb->next = NULL;
3471                 napi_gro_complete(nskb);
3472                 napi->gro_count--;
3473         }
3474
3475         if (same_flow)
3476                 goto ok;
3477
3478         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3479                 goto normal;
3480
3481         napi->gro_count++;
3482         NAPI_GRO_CB(skb)->count = 1;
3483         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3484         skb->next = napi->gro_list;
3485         napi->gro_list = skb;
3486         ret = GRO_HELD;
3487
3488 pull:
3489         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3490                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3491
3492                 BUG_ON(skb->end - skb->tail < grow);
3493
3494                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3495
3496                 skb->tail += grow;
3497                 skb->data_len -= grow;
3498
3499                 skb_shinfo(skb)->frags[0].page_offset += grow;
3500                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3501
3502                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3503                         skb_frag_unref(skb, 0);
3504                         memmove(skb_shinfo(skb)->frags,
3505                                 skb_shinfo(skb)->frags + 1,
3506                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3507                 }
3508         }
3509
3510 ok:
3511         return ret;
3512
3513 normal:
3514         ret = GRO_NORMAL;
3515         goto pull;
3516 }
3517 EXPORT_SYMBOL(dev_gro_receive);
3518
3519 static inline gro_result_t
3520 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3521 {
3522         struct sk_buff *p;
3523         unsigned int maclen = skb->dev->hard_header_len;
3524
3525         for (p = napi->gro_list; p; p = p->next) {
3526                 unsigned long diffs;
3527
3528                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3529                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3530                 if (maclen == ETH_HLEN)
3531                         diffs |= compare_ether_header(skb_mac_header(p),
3532                                                       skb_gro_mac_header(skb));
3533                 else if (!diffs)
3534                         diffs = memcmp(skb_mac_header(p),
3535                                        skb_gro_mac_header(skb),
3536                                        maclen);
3537                 NAPI_GRO_CB(p)->same_flow = !diffs;
3538                 NAPI_GRO_CB(p)->flush = 0;
3539         }
3540
3541         return dev_gro_receive(napi, skb);
3542 }
3543
3544 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3545 {
3546         switch (ret) {
3547         case GRO_NORMAL:
3548                 if (netif_receive_skb(skb))
3549                         ret = GRO_DROP;
3550                 break;
3551
3552         case GRO_DROP:
3553         case GRO_MERGED_FREE:
3554                 kfree_skb(skb);
3555                 break;
3556
3557         case GRO_HELD:
3558         case GRO_MERGED:
3559                 break;
3560         }
3561
3562         return ret;
3563 }
3564 EXPORT_SYMBOL(napi_skb_finish);
3565
3566 void skb_gro_reset_offset(struct sk_buff *skb)
3567 {
3568         NAPI_GRO_CB(skb)->data_offset = 0;
3569         NAPI_GRO_CB(skb)->frag0 = NULL;
3570         NAPI_GRO_CB(skb)->frag0_len = 0;
3571
3572         if (skb->mac_header == skb->tail &&
3573             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3574                 NAPI_GRO_CB(skb)->frag0 =
3575                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3576                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3577         }
3578 }
3579 EXPORT_SYMBOL(skb_gro_reset_offset);
3580
3581 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3582 {
3583         skb_gro_reset_offset(skb);
3584
3585         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3586 }
3587 EXPORT_SYMBOL(napi_gro_receive);
3588
3589 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3590 {
3591         __skb_pull(skb, skb_headlen(skb));
3592         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3593         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3594         skb->vlan_tci = 0;
3595         skb->dev = napi->dev;
3596         skb->skb_iif = 0;
3597
3598         napi->skb = skb;
3599 }
3600
3601 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3602 {
3603         struct sk_buff *skb = napi->skb;
3604
3605         if (!skb) {
3606                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3607                 if (skb)
3608                         napi->skb = skb;
3609         }
3610         return skb;
3611 }
3612 EXPORT_SYMBOL(napi_get_frags);
3613
3614 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3615                                gro_result_t ret)
3616 {
3617         switch (ret) {
3618         case GRO_NORMAL:
3619         case GRO_HELD:
3620                 skb->protocol = eth_type_trans(skb, skb->dev);
3621
3622                 if (ret == GRO_HELD)
3623                         skb_gro_pull(skb, -ETH_HLEN);
3624                 else if (netif_receive_skb(skb))
3625                         ret = GRO_DROP;
3626                 break;
3627
3628         case GRO_DROP:
3629         case GRO_MERGED_FREE:
3630                 napi_reuse_skb(napi, skb);
3631                 break;
3632
3633         case GRO_MERGED:
3634                 break;
3635         }
3636
3637         return ret;
3638 }
3639 EXPORT_SYMBOL(napi_frags_finish);
3640
3641 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3642 {
3643         struct sk_buff *skb = napi->skb;
3644         struct ethhdr *eth;
3645         unsigned int hlen;
3646         unsigned int off;
3647
3648         napi->skb = NULL;
3649
3650         skb_reset_mac_header(skb);
3651         skb_gro_reset_offset(skb);
3652
3653         off = skb_gro_offset(skb);
3654         hlen = off + sizeof(*eth);
3655         eth = skb_gro_header_fast(skb, off);
3656         if (skb_gro_header_hard(skb, hlen)) {
3657                 eth = skb_gro_header_slow(skb, hlen, off);
3658                 if (unlikely(!eth)) {
3659                         napi_reuse_skb(napi, skb);
3660                         skb = NULL;
3661                         goto out;
3662                 }
3663         }
3664
3665         skb_gro_pull(skb, sizeof(*eth));
3666
3667         /*
3668          * This works because the only protocols we care about don't require
3669          * special handling.  We'll fix it up properly at the end.
3670          */
3671         skb->protocol = eth->h_proto;
3672
3673 out:
3674         return skb;
3675 }
3676 EXPORT_SYMBOL(napi_frags_skb);
3677
3678 gro_result_t napi_gro_frags(struct napi_struct *napi)
3679 {
3680         struct sk_buff *skb = napi_frags_skb(napi);
3681
3682         if (!skb)
3683                 return GRO_DROP;
3684
3685         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3686 }
3687 EXPORT_SYMBOL(napi_gro_frags);
3688
/*
 * net_rps_action_and_irq_enable - send any IPIs pending for RPS.
 *
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 * Without CONFIG_RPS this only re-enables local irqs.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Detach the whole list while irqs are still off. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Kick RPS processing on each remote cpu that is still online. */
	while (remsd) {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			__smp_call_function_single(remsd->cpu,
						   &remsd->csd, 0);
		remsd = next;
	}
#else
	local_irq_enable();
#endif
}
3716
3717 static int process_backlog(struct napi_struct *napi, int quota)
3718 {
3719         int work = 0;
3720         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3721
3722 #ifdef CONFIG_RPS
3723         /* Check if we have pending ipi, its better to send them now,
3724          * not waiting net_rx_action() end.
3725          */
3726         if (sd->rps_ipi_list) {
3727                 local_irq_disable();
3728                 net_rps_action_and_irq_enable(sd);
3729         }
3730 #endif
3731         napi->weight = weight_p;
3732         local_irq_disable();
3733         while (work < quota) {
3734                 struct sk_buff *skb;
3735                 unsigned int qlen;
3736
3737                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3738                         local_irq_enable();
3739                         __netif_receive_skb(skb);
3740                         local_irq_disable();
3741                         input_queue_head_incr(sd);
3742                         if (++work >= quota) {
3743                                 local_irq_enable();
3744                                 return work;
3745                         }
3746                 }
3747
3748                 rps_lock(sd);
3749                 qlen = skb_queue_len(&sd->input_pkt_queue);
3750                 if (qlen)
3751                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3752                                                    &sd->process_queue);
3753
3754                 if (qlen < quota - work) {
3755                         /*
3756                          * Inline a custom version of __napi_complete().
3757                          * only current cpu owns and manipulates this napi,
3758                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3759                          * we can use a plain write instead of clear_bit(),
3760                          * and we dont need an smp_mb() memory barrier.
3761                          */
3762                         list_del(&napi->poll_list);
3763                         napi->state = 0;
3764
3765                         quota = work + qlen;
3766                 }
3767                 rps_unlock(sd);
3768         }
3769         local_irq_enable();
3770
3771         return work;
3772 }
3773
3774 /**
3775  * __napi_schedule - schedule for receive
3776  * @n: entry to schedule
3777  *
3778  * The entry's receive function will be scheduled to run
3779  */
3780 void __napi_schedule(struct napi_struct *n)
3781 {
3782         unsigned long flags;
3783
3784         local_irq_save(flags);
3785         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3786         local_irq_restore(flags);
3787 }
3788 EXPORT_SYMBOL(__napi_schedule);
3789
3790 void __napi_complete(struct napi_struct *n)
3791 {
3792         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3793         BUG_ON(n->gro_list);
3794
3795         list_del(&n->poll_list);
3796         smp_mb__before_clear_bit();
3797         clear_bit(NAPI_STATE_SCHED, &n->state);
3798 }
3799 EXPORT_SYMBOL(__napi_complete);
3800
3801 void napi_complete(struct napi_struct *n)
3802 {
3803         unsigned long flags;
3804
3805         /*
3806          * don't let napi dequeue from the cpu poll list
3807          * just in case its running on a different cpu
3808          */
3809         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3810                 return;
3811
3812         napi_gro_flush(n);
3813         local_irq_save(flags);
3814         __napi_complete(n);
3815         local_irq_restore(flags);
3816 }
3817 EXPORT_SYMBOL(napi_complete);
3818
3819 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3820                     int (*poll)(struct napi_struct *, int), int weight)
3821 {
3822         INIT_LIST_HEAD(&napi->poll_list);
3823         napi->gro_count = 0;
3824         napi->gro_list = NULL;
3825         napi->skb = NULL;
3826         napi->poll = poll;
3827         napi->weight = weight;
3828         list_add(&napi->dev_list, &dev->napi_list);
3829         napi->dev = dev;
3830 #ifdef CONFIG_NETPOLL
3831         spin_lock_init(&napi->poll_lock);
3832         napi->poll_owner = -1;
3833 #endif
3834         set_bit(NAPI_STATE_SCHED, &napi->state);
3835 }
3836 EXPORT_SYMBOL(netif_napi_add);
3837
3838 void netif_napi_del(struct napi_struct *napi)
3839 {
3840         struct sk_buff *skb, *next;
3841
3842         list_del_init(&napi->dev_list);
3843         napi_free_frags(napi);
3844
3845         for (skb = napi->gro_list; skb; skb = next) {
3846                 next = skb->next;
3847                 skb->next = NULL;
3848                 kfree_skb(skb);
3849         }
3850
3851         napi->gro_list = NULL;
3852         napi->gro_count = 0;
3853 }
3854 EXPORT_SYMBOL(netif_napi_del);
3855
3856 static void net_rx_action(struct softirq_action *h)
3857 {
3858         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3859         unsigned long time_limit = jiffies + 2;
3860         int budget = netdev_budget;
3861         void *have;
3862
3863         local_irq_disable();
3864
3865         while (!list_empty(&sd->poll_list)) {
3866                 struct napi_struct *n;
3867                 int work, weight;
3868
3869                 /* If softirq window is exhuasted then punt.
3870                  * Allow this to run for 2 jiffies since which will allow
3871                  * an average latency of 1.5/HZ.
3872                  */
3873                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3874                         goto softnet_break;
3875
3876                 local_irq_enable();
3877
3878                 /* Even though interrupts have been re-enabled, this
3879                  * access is safe because interrupts can only add new
3880                  * entries to the tail of this list, and only ->poll()
3881                  * calls can remove this head entry from the list.
3882                  */
3883                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3884
3885                 have = netpoll_poll_lock(n);
3886
3887                 weight = n->weight;
3888
3889                 /* This NAPI_STATE_SCHED test is for avoiding a race
3890                  * with netpoll's poll_napi().  Only the entity which
3891                  * obtains the lock and sees NAPI_STATE_SCHED set will
3892                  * actually make the ->poll() call.  Therefore we avoid
3893                  * accidentally calling ->poll() when NAPI is not scheduled.
3894                  */
3895                 work = 0;
3896                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3897                         work = n->poll(n, weight);
3898                         trace_napi_poll(n);
3899                 }
3900
3901                 WARN_ON_ONCE(work > weight);
3902
3903                 budget -= work;
3904
3905                 local_irq_disable();
3906
3907                 /* Drivers must not modify the NAPI state if they
3908                  * consume the entire weight.  In such cases this code
3909                  * still "owns" the NAPI instance and therefore can
3910                  * move the instance around on the list at-will.
3911                  */
3912                 if (unlikely(work == weight)) {
3913                         if (unlikely(napi_disable_pending(n))) {
3914                                 local_irq_enable();
3915                                 napi_complete(n);
3916                                 local_irq_disable();
3917                         } else
3918                                 list_move_tail(&n->poll_list, &sd->poll_list);
3919                 }
3920
3921                 netpoll_poll_unlock(have);
3922         }
3923 out:
3924         net_rps_action_and_irq_enable(sd);
3925
3926 #ifdef CONFIG_NET_DMA
3927         /*
3928          * There may not be any more sk_buffs coming right now, so push
3929          * any pending DMA copies to hardware
3930          */
3931         dma_issue_pending_all();
3932 #endif
3933
3934         return;
3935
3936 softnet_break:
3937         sd->time_squeeze++;
3938         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3939         goto out;
3940 }
3941
3942 static gifconf_func_t *gifconf_list[NPROTO];
3943
3944 /**
3945  *      register_gifconf        -       register a SIOCGIF handler
3946  *      @family: Address family
3947  *      @gifconf: Function handler
3948  *
3949  *      Register protocol dependent address dumping routines. The handler
3950  *      that is passed must not be freed or reused until it has been replaced
3951  *      by another handler.
3952  */
3953 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3954 {
3955         if (family >= NPROTO)
3956                 return -EINVAL;
3957         gifconf_list[family] = gifconf;
3958         return 0;
3959 }
3960 EXPORT_SYMBOL(register_gifconf);
3961
3962
3963 /*
3964  *      Map an interface index to its name (SIOCGIFNAME)
3965  */
3966
3967 /*
3968  *      We need this ioctl for efficient implementation of the
3969  *      if_indextoname() function required by the IPv6 API.  Without
3970  *      it, we would have to search all the interfaces to find a
3971  *      match.  --pb
3972  */
3973
3974 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3975 {
3976         struct net_device *dev;
3977         struct ifreq ifr;
3978
3979         /*
3980          *      Fetch the caller's info block.
3981          */
3982
3983         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3984                 return -EFAULT;
3985
3986         rcu_read_lock();
3987         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3988         if (!dev) {
3989                 rcu_read_unlock();
3990                 return -ENODEV;
3991         }
3992
3993         strcpy(ifr.ifr_name, dev->name);
3994         rcu_read_unlock();
3995
3996         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3997                 return -EFAULT;
3998         return 0;
3999 }
4000
4001 /*
4002  *      Perform a SIOCGIFCONF call. This structure will change
4003  *      size eventually, and there is nothing I can do about it.
4004  *      Thus we will need a 'compatibility mode'.
4005  */
4006
4007 static int dev_ifconf(struct net *net, char __user *arg)
4008 {
4009         struct ifconf ifc;
4010         struct net_device *dev;
4011         char __user *pos;
4012         int len;
4013         int total;
4014         int i;
4015
4016         /*
4017          *      Fetch the caller's info block.
4018          */
4019
4020         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4021                 return -EFAULT;
4022
4023         pos = ifc.ifc_buf;
4024         len = ifc.ifc_len;
4025
4026         /*
4027          *      Loop over the interfaces, and write an info block for each.
4028          */
4029
4030         total = 0;
4031         for_each_netdev(net, dev) {
4032                 for (i = 0; i < NPROTO; i++) {
4033                         if (gifconf_list[i]) {
4034                                 int done;
4035                                 if (!pos)
4036                                         done = gifconf_list[i](dev, NULL, 0);
4037                                 else
4038                                         done = gifconf_list[i](dev, pos + total,
4039                                                                len - total);
4040                                 if (done < 0)
4041                                         return -EFAULT;
4042                                 total += done;
4043                         }
4044                 }
4045         }
4046
4047         /*
4048          *      All done.  Write the updated control block back to the caller.
4049          */
4050         ifc.ifc_len = total;
4051
4052         /*
4053          *      Both BSD and Solaris return 0 here, so we do too.
4054          */
4055         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4056 }
4057
4058 #ifdef CONFIG_PROC_FS
4059
/*
 * A seq_file position for the device listing packs two fields: the
 * device-name hash bucket in the high bits and an offset within that
 * bucket in the low BUCKET_SPACE bits.
 */
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) (((b) << BUCKET_SPACE) | (o))
4065
4066 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4067 {
4068         struct net *net = seq_file_net(seq);
4069         struct net_device *dev;
4070         struct hlist_node *p;
4071         struct hlist_head *h;
4072         unsigned int count = 0, offset = get_offset(*pos);
4073
4074         h = &net->dev_name_head[get_bucket(*pos)];
4075         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4076                 if (++count == offset)
4077                         return dev;
4078         }
4079
4080         return NULL;
4081 }
4082
4083 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4084 {
4085         struct net_device *dev;
4086         unsigned int bucket;
4087
4088         do {
4089                 dev = dev_from_same_bucket(seq, pos);
4090                 if (dev)
4091                         return dev;
4092
4093                 bucket = get_bucket(*pos) + 1;
4094                 *pos = set_bucket_offset(bucket, 1);
4095         } while (bucket < NETDEV_HASHENTRIES);
4096
4097         return NULL;
4098 }
4099
4100 /*
4101  *      This is invoked by the /proc filesystem handler to display a device
4102  *      in detail.
4103  */
4104 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4105         __acquires(RCU)
4106 {
4107         rcu_read_lock();
4108         if (!*pos)
4109                 return SEQ_START_TOKEN;
4110
4111         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4112                 return NULL;
4113
4114         return dev_from_bucket(seq, pos);
4115 }
4116
4117 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4118 {
4119         ++*pos;
4120         return dev_from_bucket(seq, pos);
4121 }
4122
4123 void dev_seq_stop(struct seq_file *seq, void *v)
4124         __releases(RCU)
4125 {
4126         rcu_read_unlock();
4127 }
4128
4129 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4130 {
4131         struct rtnl_link_stats64 temp;
4132         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4133
4134         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4135                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4136                    dev->name, stats->rx_bytes, stats->rx_packets,
4137                    stats->rx_errors,
4138                    stats->rx_dropped + stats->rx_missed_errors,
4139                    stats->rx_fifo_errors,
4140                    stats->rx_length_errors + stats->rx_over_errors +
4141                     stats->rx_crc_errors + stats->rx_frame_errors,
4142                    stats->rx_compressed, stats->multicast,
4143                    stats->tx_bytes, stats->tx_packets,
4144                    stats->tx_errors, stats->tx_dropped,
4145                    stats->tx_fifo_errors, stats->collisions,
4146                    stats->tx_carrier_errors +
4147                     stats->tx_aborted_errors +
4148                     stats->tx_window_errors +
4149                     stats->tx_heartbeat_errors,
4150                    stats->tx_compressed);
4151 }
4152
4153 /*
4154  *      Called from the PROCfs module. This now uses the new arbitrary sized
4155  *      /proc/net interface to create /proc/net/dev
4156  */
4157 static int dev_seq_show(struct seq_file *seq, void *v)
4158 {
4159         if (v == SEQ_START_TOKEN)
4160                 seq_puts(seq, "Inter-|   Receive                            "
4161                               "                    |  Transmit\n"
4162                               " face |bytes    packets errs drop fifo frame "
4163                               "compressed multicast|bytes    packets errs "
4164                               "drop fifo colls carrier compressed\n");
4165         else
4166                 dev_seq_printf_stats(seq, v);
4167         return 0;
4168 }
4169
4170 static struct softnet_data *softnet_get_online(loff_t *pos)
4171 {
4172         struct softnet_data *sd = NULL;
4173
4174         while (*pos < nr_cpu_ids)
4175                 if (cpu_online(*pos)) {
4176                         sd = &per_cpu(softnet_data, *pos);
4177                         break;
4178                 } else
4179                         ++*pos;
4180         return sd;
4181 }
4182
4183 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4184 {
4185         return softnet_get_online(pos);
4186 }
4187
4188 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4189 {
4190         ++*pos;
4191         return softnet_get_online(pos);
4192 }
4193
/* seq_file ->stop: nothing to release for the softnet listing. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4197
4198 static int softnet_seq_show(struct seq_file *seq, void *v)
4199 {
4200         struct softnet_data *sd = v;
4201
4202         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4203                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4204                    0, 0, 0, 0, /* was fastroute */
4205                    sd->cpu_collision, sd->received_rps);
4206         return 0;
4207 }
4208
4209 static const struct seq_operations dev_seq_ops = {
4210         .start = dev_seq_start,
4211         .next  = dev_seq_next,
4212         .stop  = dev_seq_stop,
4213         .show  = dev_seq_show,
4214 };
4215
4216 static int dev_seq_open(struct inode *inode, struct file *file)
4217 {
4218         return seq_open_net(inode, file, &dev_seq_ops,
4219                             sizeof(struct seq_net_private));
4220 }
4221
4222 static const struct file_operations dev_seq_fops = {
4223         .owner   = THIS_MODULE,
4224         .open    = dev_seq_open,
4225         .read    = seq_read,
4226         .llseek  = seq_lseek,
4227         .release = seq_release_net,
4228 };
4229
4230 static const struct seq_operations softnet_seq_ops = {
4231         .start = softnet_seq_start,
4232         .next  = softnet_seq_next,
4233         .stop  = softnet_seq_stop,
4234         .show  = softnet_seq_show,
4235 };
4236
4237 static int softnet_seq_open(struct inode *inode, struct file *file)
4238 {
4239         return seq_open(file, &softnet_seq_ops);
4240 }
4241
4242 static const struct file_operations softnet_seq_fops = {
4243         .owner   = THIS_MODULE,
4244         .open    = softnet_seq_open,
4245         .read    = seq_read,
4246         .llseek  = seq_lseek,
4247         .release = seq_release,
4248 };
4249
4250 static void *ptype_get_idx(loff_t pos)
4251 {
4252         struct packet_type *pt = NULL;
4253         loff_t i = 0;
4254         int t;
4255
4256         list_for_each_entry_rcu(pt, &ptype_all, list) {
4257                 if (i == pos)
4258                         return pt;
4259                 ++i;
4260         }
4261
4262         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4263                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4264                         if (i == pos)
4265                                 return pt;
4266                         ++i;
4267                 }
4268         }
4269         return NULL;
4270 }
4271
4272 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4273         __acquires(RCU)
4274 {
4275         rcu_read_lock();
4276         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4277 }
4278
4279 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4280 {
4281         struct packet_type *pt;
4282         struct list_head *nxt;
4283         int hash;
4284
4285         ++*pos;
4286         if (v == SEQ_START_TOKEN)
4287                 return ptype_get_idx(0);
4288
4289         pt = v;
4290         nxt = pt->list.next;
4291         if (pt->type == htons(ETH_P_ALL)) {
4292                 if (nxt != &ptype_all)
4293                         goto found;
4294                 hash = 0;
4295                 nxt = ptype_base[0].next;
4296         } else
4297                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4298
4299         while (nxt == &ptype_base[hash]) {
4300                 if (++hash >= PTYPE_HASH_SIZE)
4301                         return NULL;
4302                 nxt = ptype_base[hash].next;
4303         }
4304 found:
4305         return list_entry(nxt, struct packet_type, list);
4306 }
4307
4308 static void ptype_seq_stop(struct seq_file *seq, void *v)
4309         __releases(RCU)
4310 {
4311         rcu_read_unlock();
4312 }
4313
4314 static int ptype_seq_show(struct seq_file *seq, void *v)
4315 {
4316         struct packet_type *pt = v;
4317
4318         if (v == SEQ_START_TOKEN)
4319                 seq_puts(seq, "Type Device      Function\n");
4320         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4321                 if (pt->type == htons(ETH_P_ALL))
4322                         seq_puts(seq, "ALL ");
4323                 else
4324                         seq_printf(seq, "%04x", ntohs(pt->type));
4325
4326                 seq_printf(seq, " %-8s %pF\n",
4327                            pt->dev ? pt->dev->name : "", pt->func);
4328         }
4329
4330         return 0;
4331 }
4332
4333 static const struct seq_operations ptype_seq_ops = {
4334         .start = ptype_seq_start,
4335         .next  = ptype_seq_next,
4336         .stop  = ptype_seq_stop,
4337         .show  = ptype_seq_show,
4338 };
4339
4340 static int ptype_seq_open(struct inode *inode, struct file *file)
4341 {
4342         return seq_open_net(inode, file, &ptype_seq_ops,
4343                         sizeof(struct seq_net_private));
4344 }
4345
4346 static const struct file_operations ptype_seq_fops = {
4347         .owner   = THIS_MODULE,
4348         .open    = ptype_seq_open,
4349         .read    = seq_read,
4350         .llseek  = seq_lseek,
4351         .release = seq_release_net,
4352 };
4353
4354
4355 static int __net_init dev_proc_net_init(struct net *net)
4356 {
4357         int rc = -ENOMEM;
4358
4359         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4360                 goto out;
4361         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4362                 goto out_dev;
4363         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4364                 goto out_softnet;
4365
4366         if (wext_proc_init(net))
4367                 goto out_ptype;
4368         rc = 0;
4369 out:
4370         return rc;
4371 out_ptype:
4372         proc_net_remove(net, "ptype");
4373 out_softnet:
4374         proc_net_remove(net, "softnet_stat");
4375 out_dev:
4376         proc_net_remove(net, "dev");
4377         goto out;
4378 }
4379
4380 static void __net_exit dev_proc_net_exit(struct net *net)
4381 {
4382         wext_proc_exit(net);
4383
4384         proc_net_remove(net, "ptype");
4385         proc_net_remove(net, "softnet_stat");
4386         proc_net_remove(net, "dev");
4387 }
4388
4389 static struct pernet_operations __net_initdata dev_proc_ops = {
4390         .init = dev_proc_net_init,
4391         .exit = dev_proc_net_exit,
4392 };
4393
4394 static int __init dev_proc_init(void)
4395 {
4396         return register_pernet_subsys(&dev_proc_ops);
4397 }
4398 #else
4399 #define dev_proc_init() 0
4400 #endif  /* CONFIG_PROC_FS */
4401
4402
4403 /**
4404  *      netdev_set_master       -       set up master pointer
4405  *      @slave: slave device
4406  *      @master: new master device
4407  *
4408  *      Changes the master device of the slave. Pass %NULL to break the
4409  *      bonding. The caller must hold the RTNL semaphore. On a failure
4410  *      a negative errno code is returned. On success the reference counts
4411  *      are adjusted and the function returns zero.
4412  */
4413 int netdev_set_master(struct net_device *slave, struct net_device *master)
4414 {
4415         struct net_device *old = slave->master;
4416
4417         ASSERT_RTNL();
4418
4419         if (master) {
4420                 if (old)
4421                         return -EBUSY;
4422                 dev_hold(master);
4423         }
4424
4425         slave->master = master;
4426
4427         if (old)
4428                 dev_put(old);
4429         return 0;
4430 }
4431 EXPORT_SYMBOL(netdev_set_master);
4432
4433 /**
4434  *      netdev_set_bond_master  -       set up bonding master/slave pair
4435  *      @slave: slave device
4436  *      @master: new master device
4437  *
4438  *      Changes the master device of the slave. Pass %NULL to break the
4439  *      bonding. The caller must hold the RTNL semaphore. On a failure
4440  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4441  *      to the routing socket and the function returns zero.
4442  */
4443 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4444 {
4445         int err;
4446
4447         ASSERT_RTNL();
4448
4449         err = netdev_set_master(slave, master);
4450         if (err)
4451                 return err;
4452         if (master)
4453                 slave->flags |= IFF_SLAVE;
4454         else
4455                 slave->flags &= ~IFF_SLAVE;
4456
4457         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4458         return 0;
4459 }
4460 EXPORT_SYMBOL(netdev_set_bond_master);
4461
4462 static void dev_change_rx_flags(struct net_device *dev, int flags)
4463 {
4464         const struct net_device_ops *ops = dev->netdev_ops;
4465
4466         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4467                 ops->ndo_change_rx_flags(dev, flags);
4468 }
4469
4470 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4471 {
4472         unsigned int old_flags = dev->flags;
4473         uid_t uid;
4474         gid_t gid;
4475
4476         ASSERT_RTNL();
4477
4478         dev->flags |= IFF_PROMISC;
4479         dev->promiscuity += inc;
4480         if (dev->promiscuity == 0) {
4481                 /*
4482                  * Avoid overflow.
4483                  * If inc causes overflow, untouch promisc and return error.
4484                  */
4485                 if (inc < 0)
4486                         dev->flags &= ~IFF_PROMISC;
4487                 else {
4488                         dev->promiscuity -= inc;
4489                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4490                                 "set promiscuity failed, promiscuity feature "
4491                                 "of device might be broken.\n", dev->name);
4492                         return -EOVERFLOW;
4493                 }
4494         }
4495         if (dev->flags != old_flags) {
4496                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4497                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4498                                                                "left");
4499                 if (audit_enabled) {
4500                         current_uid_gid(&uid, &gid);
4501                         audit_log(current->audit_context, GFP_ATOMIC,
4502                                 AUDIT_ANOM_PROMISCUOUS,
4503                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4504                                 dev->name, (dev->flags & IFF_PROMISC),
4505                                 (old_flags & IFF_PROMISC),
4506                                 audit_get_loginuid(current),
4507                                 uid, gid,
4508                                 audit_get_sessionid(current));
4509                 }
4510
4511                 dev_change_rx_flags(dev, IFF_PROMISC);
4512         }
4513         return 0;
4514 }
4515
4516 /**
4517  *      dev_set_promiscuity     - update promiscuity count on a device
4518  *      @dev: device
4519  *      @inc: modifier
4520  *
4521  *      Add or remove promiscuity from a device. While the count in the device
4522  *      remains above zero the interface remains promiscuous. Once it hits zero
4523  *      the device reverts back to normal filtering operation. A negative inc
4524  *      value is used to drop promiscuity on the device.
4525  *      Return 0 if successful or a negative errno code on error.
4526  */
4527 int dev_set_promiscuity(struct net_device *dev, int inc)
4528 {
4529         unsigned int old_flags = dev->flags;
4530         int err;
4531
4532         err = __dev_set_promiscuity(dev, inc);
4533         if (err < 0)
4534                 return err;
4535         if (dev->flags != old_flags)
4536                 dev_set_rx_mode(dev);
4537         return err;
4538 }
4539 EXPORT_SYMBOL(dev_set_promiscuity);
4540
4541 /**
4542  *      dev_set_allmulti        - update allmulti count on a device
4543  *      @dev: device
4544  *      @inc: modifier
4545  *
4546  *      Add or remove reception of all multicast frames to a device. While the
4547  *      count in the device remains above zero the interface remains listening
4548  *      to all interfaces. Once it hits zero the device reverts back to normal
4549  *      filtering operation. A negative @inc value is used to drop the counter
4550  *      when releasing a resource needing all multicasts.
4551  *      Return 0 if successful or a negative errno code on error.
4552  */
4553
4554 int dev_set_allmulti(struct net_device *dev, int inc)
4555 {
4556         unsigned int old_flags = dev->flags;
4557
4558         ASSERT_RTNL();
4559
4560         dev->flags |= IFF_ALLMULTI;
4561         dev->allmulti += inc;
4562         if (dev->allmulti == 0) {
4563                 /*
4564                  * Avoid overflow.
4565                  * If inc causes overflow, untouch allmulti and return error.
4566                  */
4567                 if (inc < 0)
4568                         dev->flags &= ~IFF_ALLMULTI;
4569                 else {
4570                         dev->allmulti -= inc;
4571                         printk(KERN_WARNING "%s: allmulti touches roof, "
4572                                 "set allmulti failed, allmulti feature of "
4573                                 "device might be broken.\n", dev->name);
4574                         return -EOVERFLOW;
4575                 }
4576         }
4577         if (dev->flags ^ old_flags) {
4578                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4579                 dev_set_rx_mode(dev);
4580         }
4581         return 0;
4582 }
4583 EXPORT_SYMBOL(dev_set_allmulti);
4584
4585 /*
4586  *      Upload unicast and multicast address lists to device and
4587  *      configure RX filtering. When the device doesn't support unicast
4588  *      filtering it is put in promiscuous mode while unicast addresses
4589  *      are present.
4590  */
4591 void __dev_set_rx_mode(struct net_device *dev)
4592 {
4593         const struct net_device_ops *ops = dev->netdev_ops;
4594
4595         /* dev_open will call this function so the list will stay sane. */
4596         if (!(dev->flags&IFF_UP))
4597                 return;
4598
4599         if (!netif_device_present(dev))
4600                 return;
4601
4602         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4603                 /* Unicast addresses changes may only happen under the rtnl,
4604                  * therefore calling __dev_set_promiscuity here is safe.
4605                  */
4606                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4607                         __dev_set_promiscuity(dev, 1);
4608                         dev->uc_promisc = true;
4609                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4610                         __dev_set_promiscuity(dev, -1);
4611                         dev->uc_promisc = false;
4612                 }
4613         }
4614
4615         if (ops->ndo_set_rx_mode)
4616                 ops->ndo_set_rx_mode(dev);
4617 }
4618
/*
 *	Locked wrapper: runs __dev_set_rx_mode() with the device's
 *	address-list lock held (bottom-half-disabling variant).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4625
4626 /**
4627  *      dev_get_flags - get flags reported to userspace
4628  *      @dev: device
4629  *
4630  *      Get the combination of flag bits exported through APIs to userspace.
4631  */
4632 unsigned dev_get_flags(const struct net_device *dev)
4633 {
4634         unsigned flags;
4635
4636         flags = (dev->flags & ~(IFF_PROMISC |
4637                                 IFF_ALLMULTI |
4638                                 IFF_RUNNING |
4639                                 IFF_LOWER_UP |
4640                                 IFF_DORMANT)) |
4641                 (dev->gflags & (IFF_PROMISC |
4642                                 IFF_ALLMULTI));
4643
4644         if (netif_running(dev)) {
4645                 if (netif_oper_up(dev))
4646                         flags |= IFF_RUNNING;
4647                 if (netif_carrier_ok(dev))
4648                         flags |= IFF_LOWER_UP;
4649                 if (netif_dormant(dev))
4650                         flags |= IFF_DORMANT;
4651         }
4652
4653         return flags;
4654 }
4655 EXPORT_SYMBOL(dev_get_flags);
4656
4657 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4658 {
4659         unsigned int old_flags = dev->flags;
4660         int ret;
4661
4662         ASSERT_RTNL();
4663
4664         /*
4665          *      Set the flags on our device.
4666          */
4667
4668         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4669                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4670                                IFF_AUTOMEDIA)) |
4671                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4672                                     IFF_ALLMULTI));
4673
4674         /*
4675          *      Load in the correct multicast list now the flags have changed.
4676          */
4677
4678         if ((old_flags ^ flags) & IFF_MULTICAST)
4679                 dev_change_rx_flags(dev, IFF_MULTICAST);
4680
4681         dev_set_rx_mode(dev);
4682
4683         /*
4684          *      Have we downed the interface. We handle IFF_UP ourselves
4685          *      according to user attempts to set it, rather than blindly
4686          *      setting it.
4687          */
4688
4689         ret = 0;
4690         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4691                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4692
4693                 if (!ret)
4694                         dev_set_rx_mode(dev);
4695         }
4696
4697         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4698                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4699
4700                 dev->gflags ^= IFF_PROMISC;
4701                 dev_set_promiscuity(dev, inc);
4702         }
4703
4704         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4705            is important. Some (broken) drivers set IFF_PROMISC, when
4706            IFF_ALLMULTI is requested not asking us and not reporting.
4707          */
4708         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4709                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4710
4711                 dev->gflags ^= IFF_ALLMULTI;
4712                 dev_set_allmulti(dev, inc);
4713         }
4714
4715         return ret;
4716 }
4717
4718 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4719 {
4720         unsigned int changes = dev->flags ^ old_flags;
4721
4722         if (changes & IFF_UP) {
4723                 if (dev->flags & IFF_UP)
4724                         call_netdevice_notifiers(NETDEV_UP, dev);
4725                 else
4726                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4727         }
4728
4729         if (dev->flags & IFF_UP &&
4730             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4731                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4732 }
4733
4734 /**
4735  *      dev_change_flags - change device settings
4736  *      @dev: device
4737  *      @flags: device state flags
4738  *
4739  *      Change settings on device based state flags. The flags are
4740  *      in the userspace exported format.
4741  */
4742 int dev_change_flags(struct net_device *dev, unsigned int flags)
4743 {
4744         int ret;
4745         unsigned int changes, old_flags = dev->flags;
4746
4747         ret = __dev_change_flags(dev, flags);
4748         if (ret < 0)
4749                 return ret;
4750
4751         changes = old_flags ^ dev->flags;
4752         if (changes)
4753                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4754
4755         __dev_notify_flags(dev, old_flags);
4756         return ret;
4757 }
4758 EXPORT_SYMBOL(dev_change_flags);
4759
4760 /**
4761  *      dev_set_mtu - Change maximum transfer unit
4762  *      @dev: device
4763  *      @new_mtu: new transfer unit
4764  *
4765  *      Change the maximum transfer size of the network device.
4766  */
4767 int dev_set_mtu(struct net_device *dev, int new_mtu)
4768 {
4769         const struct net_device_ops *ops = dev->netdev_ops;
4770         int err;
4771
4772         if (new_mtu == dev->mtu)
4773                 return 0;
4774
4775         /*      MTU must be positive.    */
4776         if (new_mtu < 0)
4777                 return -EINVAL;
4778
4779         if (!netif_device_present(dev))
4780                 return -ENODEV;
4781
4782         err = 0;
4783         if (ops->ndo_change_mtu)
4784                 err = ops->ndo_change_mtu(dev, new_mtu);
4785         else
4786                 dev->mtu = new_mtu;
4787
4788         if (!err && dev->flags & IFF_UP)
4789                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4790         return err;
4791 }
4792 EXPORT_SYMBOL(dev_set_mtu);
4793
4794 /**
4795  *      dev_set_group - Change group this device belongs to
4796  *      @dev: device
4797  *      @new_group: group this device should belong to
4798  */
4799 void dev_set_group(struct net_device *dev, int new_group)
4800 {
4801         dev->group = new_group;
4802 }
4803 EXPORT_SYMBOL(dev_set_group);
4804
4805 /**
4806  *      dev_set_mac_address - Change Media Access Control Address
4807  *      @dev: device
4808  *      @sa: new address
4809  *
4810  *      Change the hardware (MAC) address of the device
4811  */
4812 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4813 {
4814         const struct net_device_ops *ops = dev->netdev_ops;
4815         int err;
4816
4817         if (!ops->ndo_set_mac_address)
4818                 return -EOPNOTSUPP;
4819         if (sa->sa_family != dev->type)
4820                 return -EINVAL;
4821         if (!netif_device_present(dev))
4822                 return -ENODEV;
4823         err = ops->ndo_set_mac_address(dev, sa);
4824         if (!err)
4825                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4826         return err;
4827 }
4828 EXPORT_SYMBOL(dev_set_mac_address);
4829
4830 /*
4831  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4832  */
4833 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4834 {
4835         int err;
4836         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4837
4838         if (!dev)
4839                 return -ENODEV;
4840
4841         switch (cmd) {
4842         case SIOCGIFFLAGS:      /* Get interface flags */
4843                 ifr->ifr_flags = (short) dev_get_flags(dev);
4844                 return 0;
4845
4846         case SIOCGIFMETRIC:     /* Get the metric on the interface
4847                                    (currently unused) */
4848                 ifr->ifr_metric = 0;
4849                 return 0;
4850
4851         case SIOCGIFMTU:        /* Get the MTU of a device */
4852                 ifr->ifr_mtu = dev->mtu;
4853                 return 0;
4854
4855         case SIOCGIFHWADDR:
4856                 if (!dev->addr_len)
4857                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4858                 else
4859                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4860                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4861                 ifr->ifr_hwaddr.sa_family = dev->type;
4862                 return 0;
4863
4864         case SIOCGIFSLAVE:
4865                 err = -EINVAL;
4866                 break;
4867
4868         case SIOCGIFMAP:
4869                 ifr->ifr_map.mem_start = dev->mem_start;
4870                 ifr->ifr_map.mem_end   = dev->mem_end;
4871                 ifr->ifr_map.base_addr = dev->base_addr;
4872                 ifr->ifr_map.irq       = dev->irq;
4873                 ifr->ifr_map.dma       = dev->dma;
4874                 ifr->ifr_map.port      = dev->if_port;
4875                 return 0;
4876
4877         case SIOCGIFINDEX:
4878                 ifr->ifr_ifindex = dev->ifindex;
4879                 return 0;
4880
4881         case SIOCGIFTXQLEN:
4882                 ifr->ifr_qlen = dev->tx_queue_len;
4883                 return 0;
4884
4885         default:
4886                 /* dev_ioctl() should ensure this case
4887                  * is never reached
4888                  */
4889                 WARN_ON(1);
4890                 err = -ENOTTY;
4891                 break;
4892
4893         }
4894         return err;
4895 }
4896
4897 /*
4898  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4899  */
4900 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4901 {
4902         int err;
4903         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4904         const struct net_device_ops *ops;
4905
4906         if (!dev)
4907                 return -ENODEV;
4908
4909         ops = dev->netdev_ops;
4910
4911         switch (cmd) {
4912         case SIOCSIFFLAGS:      /* Set interface flags */
4913                 return dev_change_flags(dev, ifr->ifr_flags);
4914
4915         case SIOCSIFMETRIC:     /* Set the metric on the interface
4916                                    (currently unused) */
4917                 return -EOPNOTSUPP;
4918
4919         case SIOCSIFMTU:        /* Set the MTU of a device */
4920                 return dev_set_mtu(dev, ifr->ifr_mtu);
4921
4922         case SIOCSIFHWADDR:
4923                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4924
4925         case SIOCSIFHWBROADCAST:
4926                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4927                         return -EINVAL;
4928                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4929                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4930                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4931                 return 0;
4932
4933         case SIOCSIFMAP:
4934                 if (ops->ndo_set_config) {
4935                         if (!netif_device_present(dev))
4936                                 return -ENODEV;
4937                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4938                 }
4939                 return -EOPNOTSUPP;
4940
4941         case SIOCADDMULTI:
4942                 if (!ops->ndo_set_rx_mode ||
4943                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4944                         return -EINVAL;
4945                 if (!netif_device_present(dev))
4946                         return -ENODEV;
4947                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4948
4949         case SIOCDELMULTI:
4950                 if (!ops->ndo_set_rx_mode ||
4951                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4952                         return -EINVAL;
4953                 if (!netif_device_present(dev))
4954                         return -ENODEV;
4955                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4956
4957         case SIOCSIFTXQLEN:
4958                 if (ifr->ifr_qlen < 0)
4959                         return -EINVAL;
4960                 dev->tx_queue_len = ifr->ifr_qlen;
4961                 return 0;
4962
4963         case SIOCSIFNAME:
4964                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4965                 return dev_change_name(dev, ifr->ifr_newname);
4966
4967         case SIOCSHWTSTAMP:
4968                 err = net_hwtstamp_validate(ifr);
4969                 if (err)
4970                         return err;
4971                 /* fall through */
4972
4973         /*
4974          *      Unknown or private ioctl
4975          */
4976         default:
4977                 if ((cmd >= SIOCDEVPRIVATE &&
4978                     cmd <= SIOCDEVPRIVATE + 15) ||
4979                     cmd == SIOCBONDENSLAVE ||
4980                     cmd == SIOCBONDRELEASE ||
4981                     cmd == SIOCBONDSETHWADDR ||
4982                     cmd == SIOCBONDSLAVEINFOQUERY ||
4983                     cmd == SIOCBONDINFOQUERY ||
4984                     cmd == SIOCBONDCHANGEACTIVE ||
4985                     cmd == SIOCGMIIPHY ||
4986                     cmd == SIOCGMIIREG ||
4987                     cmd == SIOCSMIIREG ||
4988                     cmd == SIOCBRADDIF ||
4989                     cmd == SIOCBRDELIF ||
4990                     cmd == SIOCSHWTSTAMP ||
4991                     cmd == SIOCWANDEV) {
4992                         err = -EOPNOTSUPP;
4993                         if (ops->ndo_do_ioctl) {
4994                                 if (netif_device_present(dev))
4995                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4996                                 else
4997                                         err = -ENODEV;
4998                         }
4999                 } else
5000                         err = -EINVAL;
5001
5002         }
5003         return err;
5004 }
5005
5006 /*
5007  *      This function handles all "interface"-type I/O control requests. The actual
5008  *      'doing' part of this is dev_ifsioc above.
5009  */
5010
5011 /**
5012  *      dev_ioctl       -       network device ioctl
5013  *      @net: the applicable net namespace
5014  *      @cmd: command to issue
5015  *      @arg: pointer to a struct ifreq in user space
5016  *
5017  *      Issue ioctl functions to devices. This is normally called by the
5018  *      user space syscall interfaces but can sometimes be useful for
5019  *      other purposes. The return value is the return from the syscall if
5020  *      positive or a negative errno code on error.
5021  */
5022
5023 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5024 {
5025         struct ifreq ifr;
5026         int ret;
5027         char *colon;
5028
5029         /* One special case: SIOCGIFCONF takes ifconf argument
5030            and requires shared lock, because it sleeps writing
5031            to user space.
5032          */
5033
5034         if (cmd == SIOCGIFCONF) {
5035                 rtnl_lock();
5036                 ret = dev_ifconf(net, (char __user *) arg);
5037                 rtnl_unlock();
5038                 return ret;
5039         }
5040         if (cmd == SIOCGIFNAME)
5041                 return dev_ifname(net, (struct ifreq __user *)arg);
5042
5043         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5044                 return -EFAULT;
5045
5046         ifr.ifr_name[IFNAMSIZ-1] = 0;
5047
5048         colon = strchr(ifr.ifr_name, ':');
5049         if (colon)
5050                 *colon = 0;
5051
5052         /*
5053          *      See which interface the caller is talking about.
5054          */
5055
5056         switch (cmd) {
5057         /*
5058          *      These ioctl calls:
5059          *      - can be done by all.
5060          *      - atomic and do not require locking.
5061          *      - return a value
5062          */
5063         case SIOCGIFFLAGS:
5064         case SIOCGIFMETRIC:
5065         case SIOCGIFMTU:
5066         case SIOCGIFHWADDR:
5067         case SIOCGIFSLAVE:
5068         case SIOCGIFMAP:
5069         case SIOCGIFINDEX:
5070         case SIOCGIFTXQLEN:
5071                 dev_load(net, ifr.ifr_name);
5072                 rcu_read_lock();
5073                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5074                 rcu_read_unlock();
5075                 if (!ret) {
5076                         if (colon)
5077                                 *colon = ':';
5078                         if (copy_to_user(arg, &ifr,
5079                                          sizeof(struct ifreq)))
5080                                 ret = -EFAULT;
5081                 }
5082                 return ret;
5083
5084         case SIOCETHTOOL:
5085                 dev_load(net, ifr.ifr_name);
5086                 rtnl_lock();
5087                 ret = dev_ethtool(net, &ifr);
5088                 rtnl_unlock();
5089                 if (!ret) {
5090                         if (colon)
5091                                 *colon = ':';
5092                         if (copy_to_user(arg, &ifr,
5093                                          sizeof(struct ifreq)))
5094                                 ret = -EFAULT;
5095                 }
5096                 return ret;
5097
5098         /*
5099          *      These ioctl calls:
5100          *      - require superuser power.
5101          *      - require strict serialization.
5102          *      - return a value
5103          */
5104         case SIOCGMIIPHY:
5105         case SIOCGMIIREG:
5106         case SIOCSIFNAME:
5107                 if (!capable(CAP_NET_ADMIN))
5108                         return -EPERM;
5109                 dev_load(net, ifr.ifr_name);
5110                 rtnl_lock();
5111                 ret = dev_ifsioc(net, &ifr, cmd);
5112                 rtnl_unlock();
5113                 if (!ret) {
5114                         if (colon)
5115                                 *colon = ':';
5116                         if (copy_to_user(arg, &ifr,
5117                                          sizeof(struct ifreq)))
5118                                 ret = -EFAULT;
5119                 }
5120                 return ret;
5121
5122         /*
5123          *      These ioctl calls:
5124          *      - require superuser power.
5125          *      - require strict serialization.
5126          *      - do not return a value
5127          */
5128         case SIOCSIFFLAGS:
5129         case SIOCSIFMETRIC:
5130         case SIOCSIFMTU:
5131         case SIOCSIFMAP:
5132         case SIOCSIFHWADDR:
5133         case SIOCSIFSLAVE:
5134         case SIOCADDMULTI:
5135         case SIOCDELMULTI:
5136         case SIOCSIFHWBROADCAST:
5137         case SIOCSIFTXQLEN:
5138         case SIOCSMIIREG:
5139         case SIOCBONDENSLAVE:
5140         case SIOCBONDRELEASE:
5141         case SIOCBONDSETHWADDR:
5142         case SIOCBONDCHANGEACTIVE:
5143         case SIOCBRADDIF:
5144         case SIOCBRDELIF:
5145         case SIOCSHWTSTAMP:
5146                 if (!capable(CAP_NET_ADMIN))
5147                         return -EPERM;
5148                 /* fall through */
5149         case SIOCBONDSLAVEINFOQUERY:
5150         case SIOCBONDINFOQUERY:
5151                 dev_load(net, ifr.ifr_name);
5152                 rtnl_lock();
5153                 ret = dev_ifsioc(net, &ifr, cmd);
5154                 rtnl_unlock();
5155                 return ret;
5156
5157         case SIOCGIFMEM:
5158                 /* Get the per device memory space. We can add this but
5159                  * currently do not support it */
5160         case SIOCSIFMEM:
5161                 /* Set the per device memory buffer space.
5162                  * Not applicable in our case */
5163         case SIOCSIFLINK:
5164                 return -ENOTTY;
5165
5166         /*
5167          *      Unknown or private ioctl.
5168          */
5169         default:
5170                 if (cmd == SIOCWANDEV ||
5171                     (cmd >= SIOCDEVPRIVATE &&
5172                      cmd <= SIOCDEVPRIVATE + 15)) {
5173                         dev_load(net, ifr.ifr_name);
5174                         rtnl_lock();
5175                         ret = dev_ifsioc(net, &ifr, cmd);
5176                         rtnl_unlock();
5177                         if (!ret && copy_to_user(arg, &ifr,
5178                                                  sizeof(struct ifreq)))
5179                                 ret = -EFAULT;
5180                         return ret;
5181                 }
5182                 /* Take care of Wireless Extensions */
5183                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5184                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5185                 return -ENOTTY;
5186         }
5187 }
5188
5189
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Hands out the next interface index that __dev_get_by_index()
 *	does not find in @net.  The search resumes after the index
 *	handed out by the previous call and restarts at 1 once the
 *	counter is no longer positive.  The caller must hold the rtnl
 *	semaphore or the dev_base_lock so the result stays unique.
 */
static int dev_new_index(struct net *net)
{
	static int last_index;

	do {
		last_index++;
		/* Never hand out zero or a negative index. */
		if (last_index <= 0)
			last_index = 1;
	} while (__dev_get_by_index(net, last_index));

	return last_index;
}
5208
5209 /* Delayed registration/unregisteration */
5210 static LIST_HEAD(net_todo_list);
5211
5212 static void net_set_todo(struct net_device *dev)
5213 {
5214         list_add_tail(&dev->todo_list, &net_todo_list);
5215 }
5216
5217 static void rollback_registered_many(struct list_head *head)
5218 {
5219         struct net_device *dev, *tmp;
5220
5221         BUG_ON(dev_boot_phase);
5222         ASSERT_RTNL();
5223
5224         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5225                 /* Some devices call without registering
5226                  * for initialization unwind. Remove those
5227                  * devices and proceed with the remaining.
5228                  */
5229                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5230                         pr_debug("unregister_netdevice: device %s/%p never "
5231                                  "was registered\n", dev->name, dev);
5232
5233                         WARN_ON(1);
5234                         list_del(&dev->unreg_list);
5235                         continue;
5236                 }
5237                 dev->dismantle = true;
5238                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5239         }
5240
5241         /* If device is running, close it first. */
5242         dev_close_many(head);
5243
5244         list_for_each_entry(dev, head, unreg_list) {
5245                 /* And unlink it from device chain. */
5246                 unlist_netdevice(dev);
5247
5248                 dev->reg_state = NETREG_UNREGISTERING;
5249         }
5250
5251         synchronize_net();
5252
5253         list_for_each_entry(dev, head, unreg_list) {
5254                 /* Shutdown queueing discipline. */
5255                 dev_shutdown(dev);
5256
5257
5258                 /* Notify protocols, that we are about to destroy
5259                    this device. They should clean all the things.
5260                 */
5261                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5262
5263                 if (!dev->rtnl_link_ops ||
5264                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5265                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5266
5267                 /*
5268                  *      Flush the unicast and multicast chains
5269                  */
5270                 dev_uc_flush(dev);
5271                 dev_mc_flush(dev);
5272
5273                 if (dev->netdev_ops->ndo_uninit)
5274                         dev->netdev_ops->ndo_uninit(dev);
5275
5276                 /* Notifier chain MUST detach us from master device. */
5277                 WARN_ON(dev->master);
5278
5279                 /* Remove entries from kobject tree */
5280                 netdev_unregister_kobject(dev);
5281         }
5282
5283         /* Process any work delayed until the end of the batch */
5284         dev = list_first_entry(head, struct net_device, unreg_list);
5285         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5286
5287         synchronize_net();
5288
5289         list_for_each_entry(dev, head, unreg_list)
5290                 dev_put(dev);
5291 }
5292
5293 static void rollback_registered(struct net_device *dev)
5294 {
5295         LIST_HEAD(single);
5296
5297         list_add(&dev->unreg_list, &single);
5298         rollback_registered_many(&single);
5299         list_del(&single);
5300 }
5301
5302 static netdev_features_t netdev_fix_features(struct net_device *dev,
5303         netdev_features_t features)
5304 {
5305         /* Fix illegal checksum combinations */
5306         if ((features & NETIF_F_HW_CSUM) &&
5307             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5308                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5309                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5310         }
5311
5312         /* Fix illegal SG+CSUM combinations. */
5313         if ((features & NETIF_F_SG) &&
5314             !(features & NETIF_F_ALL_CSUM)) {
5315                 netdev_dbg(dev,
5316                         "Dropping NETIF_F_SG since no checksum feature.\n");
5317                 features &= ~NETIF_F_SG;
5318         }
5319
5320         /* TSO requires that SG is present as well. */
5321         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5322                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5323                 features &= ~NETIF_F_ALL_TSO;
5324         }
5325
5326         /* TSO ECN requires that TSO is present as well. */
5327         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5328                 features &= ~NETIF_F_TSO_ECN;
5329
5330         /* Software GSO depends on SG. */
5331         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5332                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5333                 features &= ~NETIF_F_GSO;
5334         }
5335
5336         /* UFO needs SG and checksumming */
5337         if (features & NETIF_F_UFO) {
5338                 /* maybe split UFO into V4 and V6? */
5339                 if (!((features & NETIF_F_GEN_CSUM) ||
5340                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5341                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5342                         netdev_dbg(dev,
5343                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5344                         features &= ~NETIF_F_UFO;
5345                 }
5346
5347                 if (!(features & NETIF_F_SG)) {
5348                         netdev_dbg(dev,
5349                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5350                         features &= ~NETIF_F_UFO;
5351                 }
5352         }
5353
5354         return features;
5355 }
5356
5357 int __netdev_update_features(struct net_device *dev)
5358 {
5359         netdev_features_t features;
5360         int err = 0;
5361
5362         ASSERT_RTNL();
5363
5364         features = netdev_get_wanted_features(dev);
5365
5366         if (dev->netdev_ops->ndo_fix_features)
5367                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5368
5369         /* driver might be less strict about feature dependencies */
5370         features = netdev_fix_features(dev, features);
5371
5372         if (dev->features == features)
5373                 return 0;
5374
5375         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5376                 &dev->features, &features);
5377
5378         if (dev->netdev_ops->ndo_set_features)
5379                 err = dev->netdev_ops->ndo_set_features(dev, features);
5380
5381         if (unlikely(err < 0)) {
5382                 netdev_err(dev,
5383                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5384                         err, &features, &dev->features);
5385                 return -1;
5386         }
5387
5388         if (!err)
5389                 dev->features = features;
5390
5391         return 1;
5392 }
5393
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes dev->features through __netdev_update_features() and
 *	calls netdev_features_change() whenever that helper returns a
 *	non-zero value.  Call this after driver or hardware dependent
 *	conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int ret = __netdev_update_features(dev);

	if (ret != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5408
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recomputes dev->features and sends the change notification
 *	unconditionally, even when nothing changed.  Use this instead of
 *	netdev_update_features() when dev->vlan_features might have
 *	changed as well, so that the change is propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5425
5426 /**
5427  *      netif_stacked_transfer_operstate -      transfer operstate
5428  *      @rootdev: the root or lower level device to transfer state from
5429  *      @dev: the device to transfer operstate to
5430  *
5431  *      Transfer operational state from root to device. This is normally
5432  *      called when a stacking relationship exists between the root
5433  *      device and the device(a leaf device).
5434  */
5435 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5436                                         struct net_device *dev)
5437 {
5438         if (rootdev->operstate == IF_OPER_DORMANT)
5439                 netif_dormant_on(dev);
5440         else
5441                 netif_dormant_off(dev);
5442
5443         if (netif_carrier_ok(rootdev)) {
5444                 if (!netif_carrier_ok(dev))
5445                         netif_carrier_on(dev);
5446         } else {
5447                 if (netif_carrier_ok(dev))
5448                         netif_carrier_off(dev);
5449         }
5450 }
5451 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5452
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues receive queues, store it
 * in dev->_rx and point every queue back at @dev.
 * Returns 0 on success or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nr_queues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + nr_queues; q++)
		q->dev = dev;

	return 0;
}
#endif
5473
5474 static void netdev_init_one_queue(struct net_device *dev,
5475                                   struct netdev_queue *queue, void *_unused)
5476 {
5477         /* Initialize queue lock */
5478         spin_lock_init(&queue->_xmit_lock);
5479         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5480         queue->xmit_lock_owner = -1;
5481         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5482         queue->dev = dev;
5483 #ifdef CONFIG_BQL
5484         dql_init(&queue->dql, HZ);
5485 #endif
5486 }
5487
5488 static int netif_alloc_netdev_queues(struct net_device *dev)
5489 {
5490         unsigned int count = dev->num_tx_queues;
5491         struct netdev_queue *tx;
5492
5493         BUG_ON(count < 1);
5494
5495         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5496         if (!tx) {
5497                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5498                        count);
5499                 return -ENOMEM;
5500         }
5501         dev->_tx = tx;
5502
5503         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5504         spin_lock_init(&dev->tx_global_lock);
5505
5506         return 0;
5507 }
5508
5509 /**
5510  *      register_netdevice      - register a network device
5511  *      @dev: device to register
5512  *
5513  *      Take a completed network device structure and add it to the kernel
5514  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5515  *      chain. 0 is returned on success. A negative errno code is returned
5516  *      on a failure to set up the device, or if the name is a duplicate.
5517  *
5518  *      Callers must hold the rtnl semaphore. You may want
5519  *      register_netdev() instead of this.
5520  *
5521  *      BUGS:
5522  *      The locking appears insufficient to guarantee two parallel registers
5523  *      will not get the same name.
5524  */
5525
5526 int register_netdevice(struct net_device *dev)
5527 {
5528         int ret;
5529         struct net *net = dev_net(dev);
5530
5531         BUG_ON(dev_boot_phase);
5532         ASSERT_RTNL();
5533
5534         might_sleep();
5535
5536         /* When net_device's are persistent, this will be fatal. */
5537         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5538         BUG_ON(!net);
5539
5540         spin_lock_init(&dev->addr_list_lock);
5541         netdev_set_addr_lockdep_class(dev);
5542
5543         dev->iflink = -1;
5544
5545         ret = dev_get_valid_name(dev, dev->name);
5546         if (ret < 0)
5547                 goto out;
5548
5549         /* Init, if this function is available */
5550         if (dev->netdev_ops->ndo_init) {
5551                 ret = dev->netdev_ops->ndo_init(dev);
5552                 if (ret) {
5553                         if (ret > 0)
5554                                 ret = -EIO;
5555                         goto out;
5556                 }
5557         }
5558
5559         dev->ifindex = dev_new_index(net);
5560         if (dev->iflink == -1)
5561                 dev->iflink = dev->ifindex;
5562
5563         /* Transfer changeable features to wanted_features and enable
5564          * software offloads (GSO and GRO).
5565          */
5566         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5567         dev->features |= NETIF_F_SOFT_FEATURES;
5568         dev->wanted_features = dev->features & dev->hw_features;
5569
5570         /* Turn on no cache copy if HW is doing checksum */
5571         if (!(dev->flags & IFF_LOOPBACK)) {
5572                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5573                 if (dev->features & NETIF_F_ALL_CSUM) {
5574                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5575                         dev->features |= NETIF_F_NOCACHE_COPY;
5576                 }
5577         }
5578
5579         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5580          */
5581         dev->vlan_features |= NETIF_F_HIGHDMA;
5582
5583         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5584         ret = notifier_to_errno(ret);
5585         if (ret)
5586                 goto err_uninit;
5587
5588         ret = netdev_register_kobject(dev);
5589         if (ret)
5590                 goto err_uninit;
5591         dev->reg_state = NETREG_REGISTERED;
5592
5593         __netdev_update_features(dev);
5594
5595         /*
5596          *      Default initial state at registry is that the
5597          *      device is present.
5598          */
5599
5600         set_bit(__LINK_STATE_PRESENT, &dev->state);
5601
5602         dev_init_scheduler(dev);
5603         dev_hold(dev);
5604         list_netdevice(dev);
5605
5606         /* Notify protocols, that a new device appeared. */
5607         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5608         ret = notifier_to_errno(ret);
5609         if (ret) {
5610                 rollback_registered(dev);
5611                 dev->reg_state = NETREG_UNREGISTERED;
5612         }
5613         /*
5614          *      Prevent userspace races by waiting until the network
5615          *      device is fully setup before sending notifications.
5616          */
5617         if (!dev->rtnl_link_ops ||
5618             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5619                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5620
5621 out:
5622         return ret;
5623
5624 err_uninit:
5625         if (dev->netdev_ops->ndo_uninit)
5626                 dev->netdev_ops->ndo_uninit(dev);
5627         goto out;
5628 }
5629 EXPORT_SYMBOL(register_netdevice);
5630
5631 /**
5632  *      init_dummy_netdev       - init a dummy network device for NAPI
5633  *      @dev: device to init
5634  *
5635  *      This takes a network device structure and initialize the minimum
5636  *      amount of fields so it can be used to schedule NAPI polls without
5637  *      registering a full blown interface. This is to be used by drivers
5638  *      that need to tie several hardware interfaces to a single NAPI
5639  *      poll scheduler due to HW limitations.
5640  */
5641 int init_dummy_netdev(struct net_device *dev)
5642 {
5643         /* Clear everything. Note we don't initialize spinlocks
5644          * are they aren't supposed to be taken by any of the
5645          * NAPI code and this dummy netdev is supposed to be
5646          * only ever used for NAPI polls
5647          */
5648         memset(dev, 0, sizeof(struct net_device));
5649
5650         /* make sure we BUG if trying to hit standard
5651          * register/unregister code path
5652          */
5653         dev->reg_state = NETREG_DUMMY;
5654
5655         /* NAPI wants this */
5656         INIT_LIST_HEAD(&dev->napi_list);
5657
5658         /* a dummy interface is started by default */
5659         set_bit(__LINK_STATE_PRESENT, &dev->state);
5660         set_bit(__LINK_STATE_START, &dev->state);
5661
5662         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5663          * because users of this 'device' dont need to change
5664          * its refcount.
5665          */
5666
5667         return 0;
5668 }
5669 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5670
5671
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock();
 *	the device name is validated as part of register_netdevice().
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5695
5696 int netdev_refcnt_read(const struct net_device *dev)
5697 {
5698         int i, refcnt = 0;
5699
5700         for_each_possible_cpu(i)
5701                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5702         return refcnt;
5703 }
5704 EXPORT_SYMBOL(netdev_refcnt_read);
5705
5706 /*
5707  * netdev_wait_allrefs - wait until all references are gone.
5708  *
5709  * This is called when unregistering network devices.
5710  *
5711  * Any protocol or device that holds a reference should register
5712  * for netdevice notification, and cleanup and put back the
5713  * reference if they receive an UNREGISTER event.
5714  * We can get stuck here if buggy protocols don't correctly
5715  * call dev_put.
5716  */
5717 static void netdev_wait_allrefs(struct net_device *dev)
5718 {
5719         unsigned long rebroadcast_time, warning_time;
5720         int refcnt;
5721
5722         linkwatch_forget_dev(dev);
5723
5724         rebroadcast_time = warning_time = jiffies;
5725         refcnt = netdev_refcnt_read(dev);
5726
5727         while (refcnt != 0) {
5728                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5729                         rtnl_lock();
5730
5731                         /* Rebroadcast unregister notification */
5732                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5733                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5734                          * should have already handle it the first time */
5735
5736                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5737                                      &dev->state)) {
5738                                 /* We must not have linkwatch events
5739                                  * pending on unregister. If this
5740                                  * happens, we simply run the queue
5741                                  * unscheduled, resulting in a noop
5742                                  * for this device.
5743                                  */
5744                                 linkwatch_run_queue();
5745                         }
5746
5747                         __rtnl_unlock();
5748
5749                         rebroadcast_time = jiffies;
5750                 }
5751
5752                 msleep(250);
5753
5754                 refcnt = netdev_refcnt_read(dev);
5755
5756                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5757                         printk(KERN_EMERG "unregister_netdevice: "
5758                                "waiting for %s to become free. Usage "
5759                                "count = %d\n",
5760                                dev->name, refcnt);
5761                         warning_time = jiffies;
5762                 }
5763         }
5764 }
5765
5766 /* The sequence is:
5767  *
5768  *      rtnl_lock();
5769  *      ...
5770  *      register_netdevice(x1);
5771  *      register_netdevice(x2);
5772  *      ...
5773  *      unregister_netdevice(y1);
5774  *      unregister_netdevice(y2);
5775  *      ...
5776  *      rtnl_unlock();
5777  *      free_netdev(y1);
5778  *      free_netdev(y2);
5779  *
5780  * We are invoked by rtnl_unlock().
5781  * This allows us to deal with problems:
5782  * 1) We can delete sysfs objects which invoke hotplug
5783  *    without deadlocking with linkwatch via keventd.
5784  * 2) Since we run with the RTNL semaphore not held, we can sleep
5785  *    safely in order to wait for the netdev refcnt to drop to zero.
5786  *
5787  * We must not return until all unregister events added during
5788  * the interval the lock was held have been completed.
5789  */
5790 void netdev_run_todo(void)
5791 {
5792         struct list_head list;
5793
5794         /* Snapshot list, allow later requests */
5795         list_replace_init(&net_todo_list, &list);
5796
5797         __rtnl_unlock();
5798
5799         /* Wait for rcu callbacks to finish before attempting to drain
5800          * the device list.  This usually avoids a 250ms wait.
5801          */
5802         if (!list_empty(&list))
5803                 rcu_barrier();
5804
5805         while (!list_empty(&list)) {
5806                 struct net_device *dev
5807                         = list_first_entry(&list, struct net_device, todo_list);
5808                 list_del(&dev->todo_list);
5809
5810                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5811                         printk(KERN_ERR "network todo '%s' but state %d\n",
5812                                dev->name, dev->reg_state);
5813                         dump_stack();
5814                         continue;
5815                 }
5816
5817                 dev->reg_state = NETREG_UNREGISTERED;
5818
5819                 on_each_cpu(flush_backlog, dev, 1);
5820
5821                 netdev_wait_allrefs(dev);
5822
5823                 /* paranoia */
5824                 BUG_ON(netdev_refcnt_read(dev));
5825                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5826                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5827                 WARN_ON(dev->dn_ptr);
5828
5829                 if (dev->destructor)
5830                         dev->destructor(dev);
5831
5832                 /* Free network device */
5833                 kobject_put(&dev->dev.kobj);
5834         }
5835 }
5836
5837 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5838  * fields in the same order, with only the type differing.
5839  */
5840 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5841                                     const struct net_device_stats *netdev_stats)
5842 {
5843 #if BITS_PER_LONG == 64
5844         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5845         memcpy(stats64, netdev_stats, sizeof(*stats64));
5846 #else
5847         size_t i, n = sizeof(*stats64) / sizeof(u64);
5848         const unsigned long *src = (const unsigned long *)netdev_stats;
5849         u64 *dst = (u64 *)stats64;
5850
5851         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5852                      sizeof(*stats64) / sizeof(u64));
5853         for (i = 0; i < n; i++)
5854                 dst[i] = src[i];
5855 #endif
5856 }
5857
5858 /**
5859  *      dev_get_stats   - get network device statistics
5860  *      @dev: device to get statistics from
5861  *      @storage: place to store stats
5862  *
5863  *      Get network statistics from device. Return @storage.
5864  *      The device driver may provide its own method by setting
5865  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5866  *      otherwise the internal statistics structure is used.
5867  */
5868 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5869                                         struct rtnl_link_stats64 *storage)
5870 {
5871         const struct net_device_ops *ops = dev->netdev_ops;
5872
5873         if (ops->ndo_get_stats64) {
5874                 memset(storage, 0, sizeof(*storage));
5875                 ops->ndo_get_stats64(dev, storage);
5876         } else if (ops->ndo_get_stats) {
5877                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5878         } else {
5879                 netdev_stats_to_stats64(storage, &dev->stats);
5880         }
5881         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5882         return storage;
5883 }
5884 EXPORT_SYMBOL(dev_get_stats);
5885
/*
 * dev_ingress_queue_create - return the ingress queue of @dev
 *
 * Returns whatever dev_ingress_queue() reports.  With CONFIG_NET_CLS_ACT
 * enabled, a missing queue is allocated, initialised, attached to
 * noop_qdisc and published in dev->ingress_queue first; NULL is returned
 * if that allocation fails.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
5903
5904 /**
5905  *      alloc_netdev_mqs - allocate network device
5906  *      @sizeof_priv:   size of private data to allocate space for
5907  *      @name:          device name format string
5908  *      @setup:         callback to initialize device
5909  *      @txqs:          the number of TX subqueues to allocate
5910  *      @rxqs:          the number of RX subqueues to allocate
5911  *
5912  *      Allocates a struct net_device with private data area for driver use
5913  *      and performs basic initialization.  Also allocates subquue structs
5914  *      for each queue on the device.
5915  */
5916 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5917                 void (*setup)(struct net_device *),
5918                 unsigned int txqs, unsigned int rxqs)
5919 {
5920         struct net_device *dev;
5921         size_t alloc_size;
5922         struct net_device *p;
5923
5924         BUG_ON(strlen(name) >= sizeof(dev->name));
5925
5926         if (txqs < 1) {
5927                 pr_err("alloc_netdev: Unable to allocate device "
5928                        "with zero queues.\n");
5929                 return NULL;
5930         }
5931
5932 #ifdef CONFIG_RPS
5933         if (rxqs < 1) {
5934                 pr_err("alloc_netdev: Unable to allocate device "
5935                        "with zero RX queues.\n");
5936                 return NULL;
5937         }
5938 #endif
5939
5940         alloc_size = sizeof(struct net_device);
5941         if (sizeof_priv) {
5942                 /* ensure 32-byte alignment of private area */
5943                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5944                 alloc_size += sizeof_priv;
5945         }
5946         /* ensure 32-byte alignment of whole construct */
5947         alloc_size += NETDEV_ALIGN - 1;
5948
5949         p = kzalloc(alloc_size, GFP_KERNEL);
5950         if (!p) {
5951                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5952                 return NULL;
5953         }
5954
5955         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5956         dev->padded = (char *)dev - (char *)p;
5957
5958         dev->pcpu_refcnt = alloc_percpu(int);
5959         if (!dev->pcpu_refcnt)
5960                 goto free_p;
5961
5962         if (dev_addr_init(dev))
5963                 goto free_pcpu;
5964
5965         dev_mc_init(dev);
5966         dev_uc_init(dev);
5967
5968         dev_net_set(dev, &init_net);
5969
5970         dev->gso_max_size = GSO_MAX_SIZE;
5971
5972         INIT_LIST_HEAD(&dev->napi_list);
5973         INIT_LIST_HEAD(&dev->unreg_list);
5974         INIT_LIST_HEAD(&dev->link_watch_list);
5975         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5976         setup(dev);
5977
5978         dev->num_tx_queues = txqs;
5979         dev->real_num_tx_queues = txqs;
5980         if (netif_alloc_netdev_queues(dev))
5981                 goto free_all;
5982
5983 #ifdef CONFIG_RPS
5984         dev->num_rx_queues = rxqs;
5985         dev->real_num_rx_queues = rxqs;
5986         if (netif_alloc_rx_queues(dev))
5987                 goto free_all;
5988 #endif
5989
5990         strcpy(dev->name, name);
5991         dev->group = INIT_NETDEV_GROUP;
5992         return dev;
5993
5994 free_all:
5995         free_netdev(dev);
5996         return NULL;
5997
5998 free_pcpu:
5999         free_percpu(dev->pcpu_refcnt);
6000         kfree(dev->_tx);
6001 #ifdef CONFIG_RPS
6002         kfree(dev->_rx);
6003 #endif
6004
6005 free_p:
6006         kfree(p);
6007         return NULL;
6008 }
6009 EXPORT_SYMBOL(alloc_netdev_mqs);
6010
6011 /**
6012  *      free_netdev - free network device
6013  *      @dev: device
6014  *
6015  *      This function does the last stage of destroying an allocated device
6016  *      interface. The reference to the device object is released.
6017  *      If this is the last reference then it will be freed.
6018  */
6019 void free_netdev(struct net_device *dev)
6020 {
6021         struct napi_struct *p, *n;
6022
6023         release_net(dev_net(dev));
6024
6025         kfree(dev->_tx);
6026 #ifdef CONFIG_RPS
6027         kfree(dev->_rx);
6028 #endif
6029
6030         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6031
6032         /* Flush device addresses */
6033         dev_addr_flush(dev);
6034
6035         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6036                 netif_napi_del(p);
6037
6038         free_percpu(dev->pcpu_refcnt);
6039         dev->pcpu_refcnt = NULL;
6040
6041         /*  Compatibility with error handling in drivers */
6042         if (dev->reg_state == NETREG_UNINITIALIZED) {
6043                 kfree((char *)dev - dev->padded);
6044                 return;
6045         }
6046
6047         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6048         dev->reg_state = NETREG_RELEASED;
6049
6050         /* will free via device release */
6051         put_device(&dev->dev);
6052 }
6053 EXPORT_SYMBOL(free_netdev);
6054
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.  The expedited RCU
 *	grace period is used when RTNL is locked, the normal one
 *	otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6070
6071 /**
6072  *      unregister_netdevice_queue - remove device from the kernel
6073  *      @dev: device
6074  *      @head: list
6075  *
6076  *      This function shuts down a device interface and removes it
6077  *      from the kernel tables.
6078  *      If head not NULL, device is queued to be unregistered later.
6079  *
6080  *      Callers must hold the rtnl semaphore.  You may want
6081  *      unregister_netdev() instead of this.
6082  */
6083
6084 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6085 {
6086         ASSERT_RTNL();
6087
6088         if (head) {
6089                 list_move_tail(&dev->unreg_list, head);
6090         } else {
6091                 rollback_registered(dev);
6092                 /* Finish processing unregister after unlock */
6093                 net_set_todo(dev);
6094         }
6095 }
6096 EXPORT_SYMBOL(unregister_netdevice_queue);
6097
6098 /**
6099  *      unregister_netdevice_many - unregister many devices
6100  *      @head: list of devices
6101  */
6102 void unregister_netdevice_many(struct list_head *head)
6103 {
6104         struct net_device *dev;
6105
6106         if (!list_empty(head)) {
6107                 rollback_registered_many(head);
6108                 list_for_each_entry(dev, head, unreg_list)
6109                         net_set_todo(dev);
6110         }
6111 }
6112 EXPORT_SYMBOL(unregister_netdevice_many);
6113
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper: takes the rtnl semaphore, calls
 *	unregister_netdevice() and releases the semaphore again.  In
 *	general you want to use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);	/* requires rtnl, taken just above */
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6132
6133 /**
6134  *      dev_change_net_namespace - move device to different nethost namespace
6135  *      @dev: device
6136  *      @net: network namespace
6137  *      @pat: If not NULL name pattern to try if the current device name
6138  *            is already taken in the destination network namespace.
6139  *
6140  *      This function shuts down a device interface and moves it
6141  *      to a new network namespace. On success 0 is returned, on
6142  *      a failure a netagive errno code is returned.
6143  *
6144  *      Callers must hold the rtnl semaphore.
6145  */
6146
6147 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6148 {
6149         int err;
6150
6151         ASSERT_RTNL();
6152
6153         /* Don't allow namespace local devices to be moved. */
6154         err = -EINVAL;
6155         if (dev->features & NETIF_F_NETNS_LOCAL)
6156                 goto out;
6157
6158         /* Ensure the device has been registrered */
6159         err = -EINVAL;
6160         if (dev->reg_state != NETREG_REGISTERED)
6161                 goto out;
6162
6163         /* Get out if there is nothing todo */
6164         err = 0;
6165         if (net_eq(dev_net(dev), net))
6166                 goto out;
6167
6168         /* Pick the destination device name, and ensure
6169          * we can use it in the destination network namespace.
6170          */
6171         err = -EEXIST;
6172         if (__dev_get_by_name(net, dev->name)) {
6173                 /* We get here if we can't use the current device name */
6174                 if (!pat)
6175                         goto out;
6176                 if (dev_get_valid_name(dev, pat) < 0)
6177                         goto out;
6178         }
6179
6180         /*
6181          * And now a mini version of register_netdevice unregister_netdevice.
6182          */
6183
6184         /* If device is running close it first. */
6185         dev_close(dev);
6186
6187         /* And unlink it from device chain */
6188         err = -ENODEV;
6189         unlist_netdevice(dev);
6190
6191         synchronize_net();
6192
6193         /* Shutdown queueing discipline. */
6194         dev_shutdown(dev);
6195
6196         /* Notify protocols, that we are about to destroy
6197            this device. They should clean all the things.
6198
6199            Note that dev->reg_state stays at NETREG_REGISTERED.
6200            This is wanted because this way 8021q and macvlan know
6201            the device is just moving and can keep their slaves up.
6202         */
6203         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6204         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6205         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6206
6207         /*
6208          *      Flush the unicast and multicast chains
6209          */
6210         dev_uc_flush(dev);
6211         dev_mc_flush(dev);
6212
6213         /* Actually switch the network namespace */
6214         dev_net_set(dev, net);
6215
6216         /* If there is an ifindex conflict assign a new one */
6217         if (__dev_get_by_index(net, dev->ifindex)) {
6218                 int iflink = (dev->iflink == dev->ifindex);
6219                 dev->ifindex = dev_new_index(net);
6220                 if (iflink)
6221                         dev->iflink = dev->ifindex;
6222         }
6223
6224         /* Fixup kobjects */
6225         err = device_rename(&dev->dev, dev->name);
6226         WARN_ON(err);
6227
6228         /* Add the device back in the hashes */
6229         list_netdevice(dev);
6230
6231         /* Notify protocols, that a new device appeared. */
6232         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6233
6234         /*
6235          *      Prevent userspace races by waiting until the network
6236          *      device is fully setup before sending notifications.
6237          */
6238         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6239
6240         synchronize_net();
6241         err = 0;
6242 out:
6243         return err;
6244 }
6245 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6246
6247 static int dev_cpu_callback(struct notifier_block *nfb,
6248                             unsigned long action,
6249                             void *ocpu)
6250 {
6251         struct sk_buff **list_skb;
6252         struct sk_buff *skb;
6253         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6254         struct softnet_data *sd, *oldsd;
6255
6256         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6257                 return NOTIFY_OK;
6258
6259         local_irq_disable();
6260         cpu = smp_processor_id();
6261         sd = &per_cpu(softnet_data, cpu);
6262         oldsd = &per_cpu(softnet_data, oldcpu);
6263
6264         /* Find end of our completion_queue. */
6265         list_skb = &sd->completion_queue;
6266         while (*list_skb)
6267                 list_skb = &(*list_skb)->next;
6268         /* Append completion queue from offline CPU. */
6269         *list_skb = oldsd->completion_queue;
6270         oldsd->completion_queue = NULL;
6271
6272         /* Append output queue from offline CPU. */
6273         if (oldsd->output_queue) {
6274                 *sd->output_queue_tailp = oldsd->output_queue;
6275                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6276                 oldsd->output_queue = NULL;
6277                 oldsd->output_queue_tailp = &oldsd->output_queue;
6278         }
6279         /* Append NAPI poll list from offline CPU. */
6280         if (!list_empty(&oldsd->poll_list)) {
6281                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6282                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6283         }
6284
6285         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6286         local_irq_enable();
6287
6288         /* Process offline CPU's input_pkt_queue */
6289         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6290                 netif_rx(skb);
6291                 input_queue_head_incr(oldsd);
6292         }
6293         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6294                 netif_rx(skb);
6295                 input_queue_head_incr(oldsd);
6296         }
6297
6298         return NOTIFY_OK;
6299 }
6300
6301
6302 /**
6303  *      netdev_increment_features - increment feature set by one
6304  *      @all: current feature set
6305  *      @one: new feature set
6306  *      @mask: mask feature set
6307  *
6308  *      Computes a new feature set after adding a device with feature set
6309  *      @one to the master device with current feature set @all.  Will not
6310  *      enable anything that is off in @mask. Returns the new feature set.
6311  */
6312 netdev_features_t netdev_increment_features(netdev_features_t all,
6313         netdev_features_t one, netdev_features_t mask)
6314 {
6315         if (mask & NETIF_F_GEN_CSUM)
6316                 mask |= NETIF_F_ALL_CSUM;
6317         mask |= NETIF_F_VLAN_CHALLENGED;
6318
6319         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6320         all &= one | ~NETIF_F_ALL_FOR_ALL;
6321
6322         /* If one device supports hw checksumming, set for all. */
6323         if (all & NETIF_F_GEN_CSUM)
6324                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6325
6326         return all;
6327 }
6328 EXPORT_SYMBOL(netdev_increment_features);
6329
6330 static struct hlist_head *netdev_create_hash(void)
6331 {
6332         int i;
6333         struct hlist_head *hash;
6334
6335         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6336         if (hash != NULL)
6337                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6338                         INIT_HLIST_HEAD(&hash[i]);
6339
6340         return hash;
6341 }
6342
6343 /* Initialize per network namespace state */
6344 static int __net_init netdev_init(struct net *net)
6345 {
6346         INIT_LIST_HEAD(&net->dev_base_head);
6347
6348         net->dev_name_head = netdev_create_hash();
6349         if (net->dev_name_head == NULL)
6350                 goto err_name;
6351
6352         net->dev_index_head = netdev_create_hash();
6353         if (net->dev_index_head == NULL)
6354                 goto err_idx;
6355
6356         return 0;
6357
6358 err_idx:
6359         kfree(net->dev_name_head);
6360 err_name:
6361         return -ENOMEM;
6362 }
6363
6364 /**
6365  *      netdev_drivername - network driver for the device
6366  *      @dev: network device
6367  *
6368  *      Determine network driver for device.
6369  */
6370 const char *netdev_drivername(const struct net_device *dev)
6371 {
6372         const struct device_driver *driver;
6373         const struct device *parent;
6374         const char *empty = "";
6375
6376         parent = dev->dev.parent;
6377         if (!parent)
6378                 return empty;
6379
6380         driver = parent->driver;
6381         if (driver && driver->name)
6382                 return driver->name;
6383         return empty;
6384 }
6385
6386 int __netdev_printk(const char *level, const struct net_device *dev,
6387                            struct va_format *vaf)
6388 {
6389         int r;
6390
6391         if (dev && dev->dev.parent)
6392                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6393                                netdev_name(dev), vaf);
6394         else if (dev)
6395                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6396         else
6397                 r = printk("%s(NULL net_device): %pV", level, vaf);
6398
6399         return r;
6400 }
6401 EXPORT_SYMBOL(__netdev_printk);
6402
6403 int netdev_printk(const char *level, const struct net_device *dev,
6404                   const char *format, ...)
6405 {
6406         struct va_format vaf;
6407         va_list args;
6408         int r;
6409
6410         va_start(args, format);
6411
6412         vaf.fmt = format;
6413         vaf.va = &args;
6414
6415         r = __netdev_printk(level, dev, &vaf);
6416         va_end(args);
6417
6418         return r;
6419 }
6420 EXPORT_SYMBOL(netdev_printk);
6421
6422 #define define_netdev_printk_level(func, level)                 \
6423 int func(const struct net_device *dev, const char *fmt, ...)    \
6424 {                                                               \
6425         int r;                                                  \
6426         struct va_format vaf;                                   \
6427         va_list args;                                           \
6428                                                                 \
6429         va_start(args, fmt);                                    \
6430                                                                 \
6431         vaf.fmt = fmt;                                          \
6432         vaf.va = &args;                                         \
6433                                                                 \
6434         r = __netdev_printk(level, dev, &vaf);                  \
6435         va_end(args);                                           \
6436                                                                 \
6437         return r;                                               \
6438 }                                                               \
6439 EXPORT_SYMBOL(func);
6440
6441 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6442 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6443 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6444 define_netdev_printk_level(netdev_err, KERN_ERR);
6445 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6446 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6447 define_netdev_printk_level(netdev_info, KERN_INFO);
6448
6449 static void __net_exit netdev_exit(struct net *net)
6450 {
6451         kfree(net->dev_name_head);
6452         kfree(net->dev_index_head);
6453 }
6454
6455 static struct pernet_operations __net_initdata netdev_net_ops = {
6456         .init = netdev_init,
6457         .exit = netdev_exit,
6458 };
6459
6460 static void __net_exit default_device_exit(struct net *net)
6461 {
6462         struct net_device *dev, *aux;
6463         /*
6464          * Push all migratable network devices back to the
6465          * initial network namespace
6466          */
6467         rtnl_lock();
6468         for_each_netdev_safe(net, dev, aux) {
6469                 int err;
6470                 char fb_name[IFNAMSIZ];
6471
6472                 /* Ignore unmoveable devices (i.e. loopback) */
6473                 if (dev->features & NETIF_F_NETNS_LOCAL)
6474                         continue;
6475
6476                 /* Leave virtual devices for the generic cleanup */
6477                 if (dev->rtnl_link_ops)
6478                         continue;
6479
6480                 /* Push remaining network devices to init_net */
6481                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6482                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6483                 if (err) {
6484                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6485                                 __func__, dev->name, err);
6486                         BUG();
6487                 }
6488         }
6489         rtnl_unlock();
6490 }
6491
6492 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6493 {
6494         /* At exit all network devices most be removed from a network
6495          * namespace.  Do this in the reverse order of registration.
6496          * Do this across as many network namespaces as possible to
6497          * improve batching efficiency.
6498          */
6499         struct net_device *dev;
6500         struct net *net;
6501         LIST_HEAD(dev_kill_list);
6502
6503         rtnl_lock();
6504         list_for_each_entry(net, net_list, exit_list) {
6505                 for_each_netdev_reverse(net, dev) {
6506                         if (dev->rtnl_link_ops)
6507                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6508                         else
6509                                 unregister_netdevice_queue(dev, &dev_kill_list);
6510                 }
6511         }
6512         unregister_netdevice_many(&dev_kill_list);
6513         list_del(&dev_kill_list);
6514         rtnl_unlock();
6515 }
6516
6517 static struct pernet_operations __net_initdata default_device_ops = {
6518         .exit = default_device_exit,
6519         .exit_batch = default_device_exit_batch,
6520 };
6521
6522 /*
6523  *      Initialize the DEV module. At boot time this walks the device list and
6524  *      unhooks any devices that fail to initialise (normally hardware not
6525  *      present) and leaves us with a valid list of present and active devices.
6526  *
6527  */
6528
6529 /*
6530  *       This is called single threaded during boot, so no need
6531  *       to take the rtnl semaphore.
6532  */
6533 static int __init net_dev_init(void)
6534 {
6535         int i, rc = -ENOMEM;
6536
6537         BUG_ON(!dev_boot_phase);
6538
6539         if (dev_proc_init())
6540                 goto out;
6541
6542         if (netdev_kobject_init())
6543                 goto out;
6544
6545         INIT_LIST_HEAD(&ptype_all);
6546         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6547                 INIT_LIST_HEAD(&ptype_base[i]);
6548
6549         if (register_pernet_subsys(&netdev_net_ops))
6550                 goto out;
6551
6552         /*
6553          *      Initialise the packet receive queues.
6554          */
6555
6556         for_each_possible_cpu(i) {
6557                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6558
6559                 memset(sd, 0, sizeof(*sd));
6560                 skb_queue_head_init(&sd->input_pkt_queue);
6561                 skb_queue_head_init(&sd->process_queue);
6562                 sd->completion_queue = NULL;
6563                 INIT_LIST_HEAD(&sd->poll_list);
6564                 sd->output_queue = NULL;
6565                 sd->output_queue_tailp = &sd->output_queue;
6566 #ifdef CONFIG_RPS
6567                 sd->csd.func = rps_trigger_softirq;
6568                 sd->csd.info = sd;
6569                 sd->csd.flags = 0;
6570                 sd->cpu = i;
6571 #endif
6572
6573                 sd->backlog.poll = process_backlog;
6574                 sd->backlog.weight = weight_p;
6575                 sd->backlog.gro_list = NULL;
6576                 sd->backlog.gro_count = 0;
6577         }
6578
6579         dev_boot_phase = 0;
6580
6581         /* The loopback device is special if any other network devices
6582          * is present in a network namespace the loopback device must
6583          * be present. Since we now dynamically allocate and free the
6584          * loopback device ensure this invariant is maintained by
6585          * keeping the loopback device as the first device on the
6586          * list of network devices.  Ensuring the loopback devices
6587          * is the first device that appears and the last network device
6588          * that disappears.
6589          */
6590         if (register_pernet_device(&loopback_net_ops))
6591                 goto out;
6592
6593         if (register_pernet_device(&default_device_ops))
6594                 goto out;
6595
6596         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6597         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6598
6599         hotcpu_notifier(dev_cpu_callback, 0);
6600         dst_init();
6601         dev_mcast_init();
6602         rc = 0;
6603 out:
6604         return rc;
6605 }
6606
6607 subsys_initcall(net_dev_init);
6608
6609 static int __init initialize_hashrnd(void)
6610 {
6611         get_random_bytes(&hashrnd, sizeof(hashrnd));
6612         return 0;
6613 }
6614
6615 late_initcall_sync(initialize_hashrnd);
6616