net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
 100 #include <linux/imq.h>
 101 #endif
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/stat.h>
 105 #include <linux/if_bridge.h>
 106 #include <linux/if_macvlan.h>
 107 #include <net/dst.h>
 108 #include <net/pkt_sched.h>
 109 #include <net/checksum.h>
 110 #include <linux/highmem.h>
 111 #include <linux/init.h>
 112 #include <linux/kmod.h>
 113 #include <linux/module.h>
 114 #include <linux/netpoll.h>
 115 #include <linux/rcupdate.h>
 116 #include <linux/delay.h>
 117 #include <net/wext.h>
 118 #include <net/iw_handler.h>
 119 #include <asm/current.h>
 120 #include <linux/audit.h>
 121 #include <linux/dmaengine.h>
 122 #include <linux/err.h>
 123 #include <linux/ctype.h>
 124 #include <linux/if_arp.h>
 125 #include <linux/if_vlan.h>
 126 #include <linux/ip.h>
 127 #include <net/ip.h>
 128 #include <linux/ipv6.h>
 129 #include <linux/in.h>
 130 #include <linux/jhash.h>
 131 #include <linux/random.h>
 132 #include <trace/events/napi.h>
 133
 134 #include "net-sysfs.h"
 135
 136 /* Instead of increasing this, you should create a hash table. */
 137 #define MAX_GRO_SKBS 8
 138
 139 /* This should be increased if a protocol with a bigger head is added. */
 140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 141
 142 /*
 143  *      The list of packet types we will receive (as opposed to discard)
 144  *      and the routines to invoke.
 145  *
 146  *      Why 16. Because with 16 the only overlap we get on a hash of the
 147  *      low nibble of the protocol value is RARP/SNAP/X.25.
 148  *
 149  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 150  *             sure which should go first, but I bet it won't make much
 151  *             difference if we are running VLANs.  The good news is that
 152  *             this protocol won't be in the list unless compiled in, so
 153  *             the average user (w/out VLANs) will not be adversely affected.
 154  *             --BLG
 155  *
 156  *              0800    IP
 157  *              8100    802.1Q VLAN
 158  *              0001    802.3
 159  *              0002    AX.25
 160  *              0004    802.2
 161  *              8035    RARP
 162  *              0005    SNAP
 163  *              0805    X.25
 164  *              0806    ARP
 165  *              8137    IPX
 166  *              0009    Localtalk
 167  *              86DD    IPv6
 168  */
 169
 170 #define PTYPE_HASH_SIZE (16)
 171 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 172
 173 static DEFINE_SPINLOCK(ptype_lock);
 174 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 175 static struct list_head ptype_all __read_mostly;        /* Taps */
 176
 177 /*
 178  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 179  * semaphore.
 180  *
 181  * Pure readers hold dev_base_lock for reading.
 182  *
 183  * Writers must hold the rtnl semaphore while they loop through the
 184  * dev_base_head list, and hold dev_base_lock for writing when they do the
 185  * actual updates.  This allows pure readers to access the list even
 186  * while a writer is preparing to update it.
 187  *
 188  * To put it another way, dev_base_lock is held for writing only to
 189  * protect against pure readers; the rtnl semaphore provides the
 190  * protection against other writers.
 191  *
 192  * See, for example usages, register_netdevice() and
 193  * unregister_netdevice(), which must be called with the rtnl
 194  * semaphore held.
 195  */
 196 DEFINE_RWLOCK(dev_base_lock);
 197 EXPORT_SYMBOL(dev_base_lock);
 198
 199 #define NETDEV_HASHBITS 8
 200 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 201
 202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203 {
 204         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 206 }
 207
 208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209 {
 210         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 211 }
 212
 213 /* Device list insertion */
 214 static int list_netdevice(struct net_device *dev)
 215 {
 216         struct net *net = dev_net(dev);
 217
 218         ASSERT_RTNL();
 219
 220         write_lock_bh(&dev_base_lock);
 221         list_add_tail(&dev->dev_list, &net->dev_base_head);
 222         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 223         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 224         write_unlock_bh(&dev_base_lock);
 225         return 0;
 226 }
 227
 228 /* Device list removal */
 229 static void unlist_netdevice(struct net_device *dev)
 230 {
 231         ASSERT_RTNL();
 232
 233         /* Unlink dev from the device chain */
 234         write_lock_bh(&dev_base_lock);
 235         list_del(&dev->dev_list);
 236         hlist_del(&dev->name_hlist);
 237         hlist_del(&dev->index_hlist);
 238         write_unlock_bh(&dev_base_lock);
 239 }
 240
 241 /*
 242  *      Our notifier list
 243  */
 244
 245 static RAW_NOTIFIER_HEAD(netdev_chain);
 246
 247 /*
 248  *      Device drivers call our routines to queue packets here. We empty the
 249  *      queue in the local softnet handler.
 250  */
 251
 252 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 253 EXPORT_PER_CPU_SYMBOL(softnet_data);
 254
 255 #ifdef CONFIG_LOCKDEP
 256 /*
 257  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 258  * according to dev->type
 259  */
 260 static const unsigned short netdev_lock_type[] =
 261         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 262          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 263          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 264          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 265          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 266          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 267          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 268          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 269          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 270          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 271          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 272          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 273          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 274          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 275          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 276          ARPHRD_VOID, ARPHRD_NONE};
 277
 278 static const char *const netdev_lock_name[] =
 279         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 280          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 281          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 282          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 283          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 284          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 285          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 286          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 287          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 288          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 289          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 290          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 291          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 292          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 293          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 294          "_xmit_VOID", "_xmit_NONE"};
 295
 296 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 297 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298
 299 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 300 {
 301         int i;
 302
 303         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 304                 if (netdev_lock_type[i] == dev_type)
 305                         return i;
 306         /* the last key is used by default */
 307         return ARRAY_SIZE(netdev_lock_type) - 1;
 308 }
 309
 310 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 311                                                  unsigned short dev_type)
 312 {
 313         int i;
 314
 315         i = netdev_lock_pos(dev_type);
 316         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 317                                    netdev_lock_name[i]);
 318 }
 319
 320 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 321 {
 322         int i;
 323
 324         i = netdev_lock_pos(dev->type);
 325         lockdep_set_class_and_name(&dev->addr_list_lock,
 326                                    &netdev_addr_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329 #else
 330 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 331                                                  unsigned short dev_type)
 332 {
 333 }
 334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335 {
 336 }
 337 #endif
 338
 339 /*******************************************************************************
 340
 341                 Protocol management and registration routines
 342
 343 *******************************************************************************/
 344
 345 /*
 346  *      Add a protocol ID to the list. Now that the input handler is
 347  *      smarter we can dispense with all the messy stuff that used to be
 348  *      here.
 349  *
 350  *      BEWARE!!! Protocol handlers, mangling input packets,
 351  *      MUST BE last in hash buckets and checking protocol handlers
 352  *      MUST start from promiscuous ptype_all chain in net_bh.
 353  *      It is true now, do not change it.
 354  *      Explanation follows: if protocol handler, mangling packet, will
 355  *      be the first on list, it is not able to sense, that packet
 356  *      is cloned and should be copied-on-write, so that it will
 357  *      change it and subsequent readers will get broken packet.
 358  *                                                      --ANK (980803)
 359  */
 360
 361 /**
 362  *      dev_add_pack - add packet handler
 363  *      @pt: packet type declaration
 364  *
 365  *      Add a protocol handler to the networking stack. The passed &packet_type
 366  *      is linked into kernel lists and may not be freed until it has been
 367  *      removed from the kernel lists.
 368  *
 369  *      This call does not sleep therefore it can not
 370  *      guarantee all CPU's that are in middle of receiving packets
 371  *      will see the new packet type (until the next received packet).
 372  */
 373
 374 void dev_add_pack(struct packet_type *pt)
 375 {
 376         int hash;
 377
 378         spin_lock_bh(&ptype_lock);
 379         if (pt->type == htons(ETH_P_ALL))
 380                 list_add_rcu(&pt->list, &ptype_all);
 381         else {
 382                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 383                 list_add_rcu(&pt->list, &ptype_base[hash]);
 384         }
 385         spin_unlock_bh(&ptype_lock);
 386 }
 387 EXPORT_SYMBOL(dev_add_pack);
 388
 389 /**
 390  *      __dev_remove_pack        - remove packet handler
 391  *      @pt: packet type declaration
 392  *
 393  *      Remove a protocol handler that was previously added to the kernel
 394  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 395  *      from the kernel lists and can be freed or reused once this function
 396  *      returns.
 397  *
 398  *      The packet type might still be in use by receivers
 399  *      and must not be freed until after all the CPU's have gone
 400  *      through a quiescent state.
 401  */
 402 void __dev_remove_pack(struct packet_type *pt)
 403 {
 404         struct list_head *head;
 405         struct packet_type *pt1;
 406
 407         spin_lock_bh(&ptype_lock);
 408
 409         if (pt->type == htons(ETH_P_ALL))
 410                 head = &ptype_all;
 411         else
 412                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 413
 414         list_for_each_entry(pt1, head, list) {
 415                 if (pt == pt1) {
 416                         list_del_rcu(&pt->list);
 417                         goto out;
 418                 }
 419         }
 420
 421         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 422 out:
 423         spin_unlock_bh(&ptype_lock);
 424 }
 425 EXPORT_SYMBOL(__dev_remove_pack);
 426
 427 /**
 428  *      dev_remove_pack  - remove packet handler
 429  *      @pt: packet type declaration
 430  *
 431  *      Remove a protocol handler that was previously added to the kernel
 432  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 433  *      from the kernel lists and can be freed or reused once this function
 434  *      returns.
 435  *
 436  *      This call sleeps to guarantee that no CPU is looking at the packet
 437  *      type after return.
 438  */
 439 void dev_remove_pack(struct packet_type *pt)
 440 {
 441         __dev_remove_pack(pt);
 442
 443         synchronize_net();
 444 }
 445 EXPORT_SYMBOL(dev_remove_pack);
 446
 447 /******************************************************************************
 448
 449                       Device Boot-time Settings Routines
 450
 451 *******************************************************************************/
 452
 453 /* Boot time configuration table */
 454 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 455
 456 /**
 457  *      netdev_boot_setup_add   - add new setup entry
 458  *      @name: name of the device
 459  *      @map: configured settings for the device
 460  *
 461  *      Adds new setup entry to the dev_boot_setup list.  The function
 462  *      returns 0 on error and 1 on success.  This is a generic routine to
 463  *      all netdevices.
 464  */
 465 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 466 {
 467         struct netdev_boot_setup *s;
 468         int i;
 469
 470         s = dev_boot_setup;
 471         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 472                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 473                         memset(s[i].name, 0, sizeof(s[i].name));
 474                         strlcpy(s[i].name, name, IFNAMSIZ);
 475                         memcpy(&s[i].map, map, sizeof(s[i].map));
 476                         break;
 477                 }
 478         }
 479
 480         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 481 }
 482
 483 /**
 484  *      netdev_boot_setup_check - check boot time settings
 485  *      @dev: the netdevice
 486  *
 487  *      Check boot time settings for the device.
 488  *      The found settings are set for the device to be used
 489  *      later in the device probing.
 490  *      Returns 0 if no settings found, 1 if they are.
 491  */
 492 int netdev_boot_setup_check(struct net_device *dev)
 493 {
 494         struct netdev_boot_setup *s = dev_boot_setup;
 495         int i;
 496
 497         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 498                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 499                     !strcmp(dev->name, s[i].name)) {
 500                         dev->irq        = s[i].map.irq;
 501                         dev->base_addr  = s[i].map.base_addr;
 502                         dev->mem_start  = s[i].map.mem_start;
 503                         dev->mem_end    = s[i].map.mem_end;
 504                         return 1;
 505                 }
 506         }
 507         return 0;
 508 }
 509 EXPORT_SYMBOL(netdev_boot_setup_check);
 510
 511
 512 /**
 513  *      netdev_boot_base        - get address from boot time settings
 514  *      @prefix: prefix for network device
 515  *      @unit: id for network device
 516  *
 517  *      Check boot time settings for the base address of device.
 518  *      The found settings are set for the device to be used
 519  *      later in the device probing.
 520  *      Returns 0 if no settings found.
 521  */
 522 unsigned long netdev_boot_base(const char *prefix, int unit)
 523 {
 524         const struct netdev_boot_setup *s = dev_boot_setup;
 525         char name[IFNAMSIZ];
 526         int i;
 527
 528         sprintf(name, "%s%d", prefix, unit);
 529
 530         /*
 531          * If device already registered then return base of 1
 532          * to indicate not to probe for this interface
 533          */
 534         if (__dev_get_by_name(&init_net, name))
 535                 return 1;
 536
 537         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 538                 if (!strcmp(name, s[i].name))
 539                         return s[i].map.base_addr;
 540         return 0;
 541 }
 542
 543 /*
 544  * Saves at boot time configured settings for any netdevice.
 545  */
 546 int __init netdev_boot_setup(char *str)
 547 {
 548         int ints[5];
 549         struct ifmap map;
 550
 551         str = get_options(str, ARRAY_SIZE(ints), ints);
 552         if (!str || !*str)
 553                 return 0;
 554
 555         /* Save settings */
 556         memset(&map, 0, sizeof(map));
 557         if (ints[0] > 0)
 558                 map.irq = ints[1];
 559         if (ints[0] > 1)
 560                 map.base_addr = ints[2];
 561         if (ints[0] > 2)
 562                 map.mem_start = ints[3];
 563         if (ints[0] > 3)
 564                 map.mem_end = ints[4];
 565
 566         /* Add new entry to the list */
 567         return netdev_boot_setup_add(str, &map);
 568 }
 569
 570 __setup("netdev=", netdev_boot_setup);
 571
 572 /*******************************************************************************
 573
 574                             Device Interface Subroutines
 575
 576 *******************************************************************************/
 577
 578 /**
 579  *      __dev_get_by_name       - find a device by its name
 580  *      @net: the applicable net namespace
 581  *      @name: name to find
 582  *
 583  *      Find an interface by name. Must be called under RTNL semaphore
 584  *      or @dev_base_lock. If the name is found a pointer to the device
 585  *      is returned. If the name is not found then %NULL is returned. The
 586  *      reference counters are not incremented so the caller must be
 587  *      careful with locks.
 588  */
 589
 590 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 591 {
 592         struct hlist_node *p;
 593
 594         hlist_for_each(p, dev_name_hash(net, name)) {
 595                 struct net_device *dev
 596                         = hlist_entry(p, struct net_device, name_hlist);
 597                 if (!strncmp(dev->name, name, IFNAMSIZ))
 598                         return dev;
 599         }
 600         return NULL;
 601 }
 602 EXPORT_SYMBOL(__dev_get_by_name);
 603
 604 /**
 605  *      dev_get_by_name         - find a device by its name
 606  *      @net: the applicable net namespace
 607  *      @name: name to find
 608  *
 609  *      Find an interface by name. This can be called from any
 610  *      context and does its own locking. The returned handle has
 611  *      the usage count incremented and the caller must use dev_put() to
 612  *      release it when it is no longer needed. %NULL is returned if no
 613  *      matching device is found.
 614  */
 615
 616 struct net_device *dev_get_by_name(struct net *net, const char *name)
 617 {
 618         struct net_device *dev;
 619
 620         read_lock(&dev_base_lock);
 621         dev = __dev_get_by_name(net, name);
 622         if (dev)
 623                 dev_hold(dev);
 624         read_unlock(&dev_base_lock);
 625         return dev;
 626 }
 627 EXPORT_SYMBOL(dev_get_by_name);
 628
 629 /**
 630  *      __dev_get_by_index - find a device by its ifindex
 631  *      @net: the applicable net namespace
 632  *      @ifindex: index of device
 633  *
 634  *      Search for an interface by index. Returns %NULL if the device
 635  *      is not found or a pointer to the device. The device has not
 636  *      had its reference counter increased so the caller must be careful
 637  *      about locking. The caller must hold either the RTNL semaphore
 638  *      or @dev_base_lock.
 639  */
 640
 641 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 642 {
 643         struct hlist_node *p;
 644
 645         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 646                 struct net_device *dev
 647                         = hlist_entry(p, struct net_device, index_hlist);
 648                 if (dev->ifindex == ifindex)
 649                         return dev;
 650         }
 651         return NULL;
 652 }
 653 EXPORT_SYMBOL(__dev_get_by_index);
 654
 655
 656 /**
 657  *      dev_get_by_index - find a device by its ifindex
 658  *      @net: the applicable net namespace
 659  *      @ifindex: index of device
 660  *
 661  *      Search for an interface by index. Returns NULL if the device
 662  *      is not found or a pointer to the device. The device returned has
 663  *      had a reference added and the pointer is safe until the user calls
 664  *      dev_put to indicate they have finished with it.
 665  */
 666
 667 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 668 {
 669         struct net_device *dev;
 670
 671         read_lock(&dev_base_lock);
 672         dev = __dev_get_by_index(net, ifindex);
 673         if (dev)
 674                 dev_hold(dev);
 675         read_unlock(&dev_base_lock);
 676         return dev;
 677 }
 678 EXPORT_SYMBOL(dev_get_by_index);
 679
 680 /**
 681  *      dev_getbyhwaddr - find a device by its hardware address
 682  *      @net: the applicable net namespace
 683  *      @type: media type of device
 684  *      @ha: hardware address
 685  *
 686  *      Search for an interface by MAC address. Returns NULL if the device
 687  *      is not found or a pointer to the device. The caller must hold the
 688  *      rtnl semaphore. The returned device has not had its ref count increased
 689  *      and the caller must therefore be careful about locking
 690  *
 691  *      BUGS:
 692  *      If the API was consistent this would be __dev_get_by_hwaddr
 693  */
 694
 695 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 696 {
 697         struct net_device *dev;
 698
 699         ASSERT_RTNL();
 700
 701         for_each_netdev(net, dev)
 702                 if (dev->type == type &&
 703                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 704                         return dev;
 705
 706         return NULL;
 707 }
 708 EXPORT_SYMBOL(dev_getbyhwaddr);
 709
 710 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 711 {
 712         struct net_device *dev;
 713
 714         ASSERT_RTNL();
 715         for_each_netdev(net, dev)
 716                 if (dev->type == type)
 717                         return dev;
 718
 719         return NULL;
 720 }
 721 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 722
 723 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 724 {
 725         struct net_device *dev;
 726
 727         rtnl_lock();
 728         dev = __dev_getfirstbyhwtype(net, type);
 729         if (dev)
 730                 dev_hold(dev);
 731         rtnl_unlock();
 732         return dev;
 733 }
 734 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 735
 736 /**
 737  *      dev_get_by_flags - find any device with given flags
 738  *      @net: the applicable net namespace
 739  *      @if_flags: IFF_* values
 740  *      @mask: bitmask of bits in if_flags to check
 741  *
 742  *      Search for any interface with the given flags. Returns NULL if a device
 743  *      is not found or a pointer to the device. The device returned has
 744  *      had a reference added and the pointer is safe until the user calls
 745  *      dev_put to indicate they have finished with it.
 746  */
 747
 748 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 749                                     unsigned short mask)
 750 {
 751         struct net_device *dev, *ret;
 752
 753         ret = NULL;
 754         read_lock(&dev_base_lock);
 755         for_each_netdev(net, dev) {
 756                 if (((dev->flags ^ if_flags) & mask) == 0) {
 757                         dev_hold(dev);
 758                         ret = dev;
 759                         break;
 760                 }
 761         }
 762         read_unlock(&dev_base_lock);
 763         return ret;
 764 }
 765 EXPORT_SYMBOL(dev_get_by_flags);
 766
 767 /**
 768  *      dev_valid_name - check if name is okay for network device
 769  *      @name: name string
 770  *
 771  *      Network device names need to be valid file names to
 772  *      to allow sysfs to work.  We also disallow any kind of
 773  *      whitespace.
 774  */
 775 int dev_valid_name(const char *name)
 776 {
 777         if (*name == '\0')
 778                 return 0;
 779         if (strlen(name) >= IFNAMSIZ)
 780                 return 0;
 781         if (!strcmp(name, ".") || !strcmp(name, ".."))
 782                 return 0;
 783
 784         while (*name) {
 785                 if (*name == '/' || isspace(*name))
 786                         return 0;
 787                 name++;
 788         }
 789         return 1;
 790 }
 791 EXPORT_SYMBOL(dev_valid_name);
 792
 793 /**
 794  *      __dev_alloc_name - allocate a name for a device
 795  *      @net: network namespace to allocate the device name in
 796  *      @name: name format string
 797  *      @buf:  scratch buffer and result name string
 798  *
 799  *      Passed a format string - eg "lt%d" it will try and find a suitable
 800  *      id. It scans list of devices to build up a free map, then chooses
 801  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 802  *      while allocating the name and adding the device in order to avoid
 803  *      duplicates.
 804  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 805  *      Returns the number of the unit assigned or a negative errno code.
 806  */
 807
 808 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 809 {
 810         int i = 0;
 811         const char *p;
 812         const int max_netdevices = 8*PAGE_SIZE;
 813         unsigned long *inuse;
 814         struct net_device *d;
 815
 816         p = strnchr(name, IFNAMSIZ-1, '%');
 817         if (p) {
 818                 /*
 819                  * Verify the string as this thing may have come from
 820                  * the user.  There must be either one "%d" and no other "%"
 821                  * characters.
 822                  */
 823                 if (p[1] != 'd' || strchr(p + 2, '%'))
 824                         return -EINVAL;
 825
 826                 /* Use one page as a bit array of possible slots */
 827                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 828                 if (!inuse)
 829                         return -ENOMEM;
 830
 831                 for_each_netdev(net, d) {
 832                         if (!sscanf(d->name, name, &i))
 833                                 continue;
 834                         if (i < 0 || i >= max_netdevices)
 835                                 continue;
 836
 837                         /*  avoid cases where sscanf is not exact inverse of printf */
 838                         snprintf(buf, IFNAMSIZ, name, i);
 839                         if (!strncmp(buf, d->name, IFNAMSIZ))
 840                                 set_bit(i, inuse);
 841                 }
 842
 843                 i = find_first_zero_bit(inuse, max_netdevices);
 844                 free_page((unsigned long) inuse);
 845         }
 846
 847         snprintf(buf, IFNAMSIZ, name, i);
 848         if (!__dev_get_by_name(net, buf))
 849                 return i;
 850
 851         /* It is possible to run out of possible slots
 852          * when the name is long and there isn't enough space left
 853          * for the digits, or if all bits are used.
 854          */
 855         return -ENFILE;
 856 }
 857
 858 /**
 859  *      dev_alloc_name - allocate a name for a device
 860  *      @dev: device
 861  *      @name: name format string
 862  *
 863  *      Passed a format string - eg "lt%d" it will try and find a suitable
 864  *      id. It scans list of devices to build up a free map, then chooses
 865  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 866  *      while allocating the name and adding the device in order to avoid
 867  *      duplicates.
 868  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 869  *      Returns the number of the unit assigned or a negative errno code.
 870  */
 871
 872 int dev_alloc_name(struct net_device *dev, const char *name)
 873 {
 874         char buf[IFNAMSIZ];
 875         struct net *net;
 876         int ret;
 877
 878         BUG_ON(!dev_net(dev));
 879         net = dev_net(dev);
 880         ret = __dev_alloc_name(net, name, buf);
 881         if (ret >= 0)
 882                 strlcpy(dev->name, buf, IFNAMSIZ);
 883         return ret;
 884 }
 885 EXPORT_SYMBOL(dev_alloc_name);
 886
 887
 888 /**
 889  *      dev_change_name - change name of a device
 890  *      @dev: device
 891  *      @newname: name (or format string) must be at least IFNAMSIZ
 892  *
 893  *      Change name of a device, can pass format strings "eth%d".
 894  *      for wildcarding.
 895  */
 896 int dev_change_name(struct net_device *dev, const char *newname)
 897 {
 898         char oldname[IFNAMSIZ];
 899         int err = 0;
 900         int ret;
 901         struct net *net;
 902
 903         ASSERT_RTNL();
 904         BUG_ON(!dev_net(dev));
 905
 906         net = dev_net(dev);
 907         if (dev->flags & IFF_UP)
 908                 return -EBUSY;
 909
 910         if (!dev_valid_name(newname))
 911                 return -EINVAL;
 912
 913         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 914                 return 0;
 915
 916         memcpy(oldname, dev->name, IFNAMSIZ);
 917
 918         if (strchr(newname, '%')) {
 919                 err = dev_alloc_name(dev, newname);
 920                 if (err < 0)
 921                         return err;
 922         } else if (__dev_get_by_name(net, newname))
 923                 return -EEXIST;
 924         else
 925                 strlcpy(dev->name, newname, IFNAMSIZ);
 926
 927 rollback:
 928         /* For now only devices in the initial network namespace
 929          * are in sysfs.
 930          */
 931         if (net == &init_net) {
 932                 ret = device_rename(&dev->dev, dev->name);
 933                 if (ret) {
 934                         memcpy(dev->name, oldname, IFNAMSIZ);
 935                         return ret;
 936                 }
 937         }
 938
 939         write_lock_bh(&dev_base_lock);
 940         hlist_del(&dev->name_hlist);
 941         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 942         write_unlock_bh(&dev_base_lock);
 943
 944         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 945         ret = notifier_to_errno(ret);
 946
 947         if (ret) {
 948                 /* err >= 0 after dev_alloc_name() or stores the first errno */
 949                 if (err >= 0) {
 950                         err = ret;
 951                         memcpy(dev->name, oldname, IFNAMSIZ);
 952                         goto rollback;
 953                 } else {
 954                         printk(KERN_ERR
 955                                "%s: name change rollback failed: %d.\n",
 956                                dev->name, ret);
 957                 }
 958         }
 959
 960         return err;
 961 }
 962
 963 /**
 964  *      dev_set_alias - change ifalias of a device
 965  *      @dev: device
 966  *      @alias: name up to IFALIASZ
 967  *      @len: limit of bytes to copy from info
 968  *
 969  *      Set ifalias for a device,
 970  */
 971 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 972 {
 973         ASSERT_RTNL();
 974
 975         if (len >= IFALIASZ)
 976                 return -EINVAL;
 977
 978         if (!len) {
 979                 if (dev->ifalias) {
 980                         kfree(dev->ifalias);
 981                         dev->ifalias = NULL;
 982                 }
 983                 return 0;
 984         }
 985
 986         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
 987         if (!dev->ifalias)
 988                 return -ENOMEM;
 989
 990         strlcpy(dev->ifalias, alias, len+1);
 991         return len;
 992 }
 993
 994
 995 /**
 996  *      netdev_features_change - device changes features
 997  *      @dev: device to cause notification
 998  *
 999  *      Called to indicate a device has changed features.
1000  */
1001 void netdev_features_change(struct net_device *dev)
1002 {
1003         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1004 }
1005 EXPORT_SYMBOL(netdev_features_change);
1006
1007 /**
1008  *      netdev_state_change - device changes state
1009  *      @dev: device to cause notification
1010  *
1011  *      Called to indicate a device has changed state. This function calls
1012  *      the notifier chains for netdev_chain and sends a NEWLINK message
1013  *      to the routing socket.
1014  */
1015 void netdev_state_change(struct net_device *dev)
1016 {
1017         if (dev->flags & IFF_UP) {
1018                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1019                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1020         }
1021 }
1022 EXPORT_SYMBOL(netdev_state_change);
1023
/**
 *      netdev_bonding_change - forward an event to the netdevice notifiers
 *      @dev: device the event refers to
 *      @event: notifier event code, passed through unmodified
 *
 *      Calls the netdevice notifier chain with @event for @dev.  The
 *      chain's return value is ignored.
 */
void netdev_bonding_change(struct net_device *dev, unsigned long event)
{
        call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1029
1030 /**
1031  *      dev_load        - load a network module
1032  *      @net: the applicable net namespace
1033  *      @name: name of interface
1034  *
1035  *      If a network interface is not present and the process has suitable
1036  *      privileges this function loads the module. If module loading is not
1037  *      available in this kernel then it becomes a nop.
1038  */
1039
1040 void dev_load(struct net *net, const char *name)
1041 {
1042         struct net_device *dev;
1043
1044         read_lock(&dev_base_lock);
1045         dev = __dev_get_by_name(net, name);
1046         read_unlock(&dev_base_lock);
1047
1048         if (!dev && capable(CAP_NET_ADMIN))
1049                 request_module("%s", name);
1050 }
1051 EXPORT_SYMBOL(dev_load);
1052
1053 /**
1054  *      dev_open        - prepare an interface for use.
1055  *      @dev:   device to open
1056  *
1057  *      Takes a device from down to up state. The device's private open
1058  *      function is invoked and then the multicast lists are loaded. Finally
1059  *      the device is moved into the up state and a %NETDEV_UP message is
1060  *      sent to the netdev notifier chain.
1061  *
1062  *      Calling this function on an active interface is a nop. On a failure
1063  *      a negative errno code is returned.
1064  */
1065 int dev_open(struct net_device *dev)
1066 {
1067         const struct net_device_ops *ops = dev->netdev_ops;
1068         int ret;
1069
1070         ASSERT_RTNL();
1071
1072         /*
1073          *      Is it already up?
1074          */
1075
1076         if (dev->flags & IFF_UP)
1077                 return 0;
1078
1079         /*
1080          *      Is it even present?
1081          */
1082         if (!netif_device_present(dev))
1083                 return -ENODEV;
1084
1085         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1086         ret = notifier_to_errno(ret);
1087         if (ret)
1088                 return ret;
1089
1090         /*
1091          *      Call device private open method
1092          */
1093         set_bit(__LINK_STATE_START, &dev->state);
1094
1095         if (ops->ndo_validate_addr)
1096                 ret = ops->ndo_validate_addr(dev);
1097
1098         if (!ret && ops->ndo_open)
1099                 ret = ops->ndo_open(dev);
1100
1101         /*
1102          *      If it went open OK then:
1103          */
1104
1105         if (ret)
1106                 clear_bit(__LINK_STATE_START, &dev->state);
1107         else {
1108                 /*
1109                  *      Set the flags.
1110                  */
1111                 dev->flags |= IFF_UP;
1112
1113                 /*
1114                  *      Enable NET_DMA
1115                  */
1116                 net_dmaengine_get();
1117
1118                 /*
1119                  *      Initialize multicasting status
1120                  */
1121                 dev_set_rx_mode(dev);
1122
1123                 /*
1124                  *      Wakeup transmit queue engine
1125                  */
1126                 dev_activate(dev);
1127
1128                 /*
1129                  *      ... and announce new interface.
1130                  */
1131                 call_netdevice_notifiers(NETDEV_UP, dev);
1132         }
1133
1134         return ret;
1135 }
1136 EXPORT_SYMBOL(dev_open);
1137
1138 /**
1139  *      dev_close - shutdown an interface.
1140  *      @dev: device to shutdown
1141  *
1142  *      This function moves an active device into down state. A
1143  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1144  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1145  *      chain.
1146  */
1147 int dev_close(struct net_device *dev)
1148 {
1149         const struct net_device_ops *ops = dev->netdev_ops;
1150         ASSERT_RTNL();
1151
1152         might_sleep();
1153
1154         if (!(dev->flags & IFF_UP))
1155                 return 0;
1156
1157         /*
1158          *      Tell people we are going down, so that they can
1159          *      prepare to death, when device is still operating.
1160          */
1161         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1162
1163         clear_bit(__LINK_STATE_START, &dev->state);
1164
1165         /* Synchronize to scheduled poll. We cannot touch poll list,
1166          * it can be even on different cpu. So just clear netif_running().
1167          *
1168          * dev->stop() will invoke napi_disable() on all of it's
1169          * napi_struct instances on this device.
1170          */
1171         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1172
1173         dev_deactivate(dev);
1174
1175         /*
1176          *      Call the device specific close. This cannot fail.
1177          *      Only if device is UP
1178          *
1179          *      We allow it to be called even after a DETACH hot-plug
1180          *      event.
1181          */
1182         if (ops->ndo_stop)
1183                 ops->ndo_stop(dev);
1184
1185         /*
1186          *      Device is now down.
1187          */
1188
1189         dev->flags &= ~IFF_UP;
1190
1191         /*
1192          * Tell people we are down
1193          */
1194         call_netdevice_notifiers(NETDEV_DOWN, dev);
1195
1196         /*
1197          *      Shutdown NET_DMA
1198          */
1199         net_dmaengine_put();
1200
1201         return 0;
1202 }
1203 EXPORT_SYMBOL(dev_close);
1204
1205
1206 /**
1207  *      dev_disable_lro - disable Large Receive Offload on a device
1208  *      @dev: device
1209  *
1210  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1211  *      called under RTNL.  This is needed if received packets may be
1212  *      forwarded to another interface.
1213  */
1214 void dev_disable_lro(struct net_device *dev)
1215 {
1216         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1217             dev->ethtool_ops->set_flags) {
1218                 u32 flags = dev->ethtool_ops->get_flags(dev);
1219                 if (flags & ETH_FLAG_LRO) {
1220                         flags &= ~ETH_FLAG_LRO;
1221                         dev->ethtool_ops->set_flags(dev, flags);
1222                 }
1223         }
1224         WARN_ON(dev->features & NETIF_F_LRO);
1225 }
1226 EXPORT_SYMBOL(dev_disable_lro);
1227
1228
1229 static int dev_boot_phase = 1;
1230
1231 /*
1232  *      Device change register/unregister. These are not inline or static
1233  *      as we export them to the world.
1234  */
1235
1236 /**
1237  *      register_netdevice_notifier - register a network notifier block
1238  *      @nb: notifier
1239  *
1240  *      Register a notifier to be called when network device events occur.
1241  *      The notifier passed is linked into the kernel structures and must
1242  *      not be reused until it has been unregistered. A negative errno code
1243  *      is returned on a failure.
1244  *
1245  *      When registered all registration and up events are replayed
1246  *      to the new notifier to allow device to have a race free
1247  *      view of the network device list.
1248  */
1249
1250 int register_netdevice_notifier(struct notifier_block *nb)
1251 {
1252         struct net_device *dev;
1253         struct net_device *last;
1254         struct net *net;
1255         int err;
1256
1257         rtnl_lock();
1258         err = raw_notifier_chain_register(&netdev_chain, nb);
1259         if (err)
1260                 goto unlock;
1261         if (dev_boot_phase)
1262                 goto unlock;
1263         for_each_net(net) {
1264                 for_each_netdev(net, dev) {
1265                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1266                         err = notifier_to_errno(err);
1267                         if (err)
1268                                 goto rollback;
1269
1270                         if (!(dev->flags & IFF_UP))
1271                                 continue;
1272
1273                         nb->notifier_call(nb, NETDEV_UP, dev);
1274                 }
1275         }
1276
1277 unlock:
1278         rtnl_unlock();
1279         return err;
1280
1281 rollback:
1282         last = dev;
1283         for_each_net(net) {
1284                 for_each_netdev(net, dev) {
1285                         if (dev == last)
1286                                 break;
1287
1288                         if (dev->flags & IFF_UP) {
1289                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1290                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1291                         }
1292                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1293                 }
1294         }
1295
1296         raw_notifier_chain_unregister(&netdev_chain, nb);
1297         goto unlock;
1298 }
1299 EXPORT_SYMBOL(register_netdevice_notifier);
1300
1301 /**
1302  *      unregister_netdevice_notifier - unregister a network notifier block
1303  *      @nb: notifier
1304  *
1305  *      Unregister a notifier previously registered by
1306  *      register_netdevice_notifier(). The notifier is unlinked into the
1307  *      kernel structures and may then be reused. A negative errno code
1308  *      is returned on a failure.
1309  */
1310
1311 int unregister_netdevice_notifier(struct notifier_block *nb)
1312 {
1313         int err;
1314
1315         rtnl_lock();
1316         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1317         rtnl_unlock();
1318         return err;
1319 }
1320 EXPORT_SYMBOL(unregister_netdevice_notifier);
1321
1322 /**
1323  *      call_netdevice_notifiers - call all network notifier blocks
1324  *      @val: value passed unmodified to notifier function
1325  *      @dev: net_device pointer passed unmodified to notifier function
1326  *
1327  *      Call all network notifier blocks.  Parameters and return value
1328  *      are as for raw_notifier_call_chain().
1329  */
1330
1331 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1332 {
1333         return raw_notifier_call_chain(&netdev_chain, val, dev);
1334 }
1335
1336 /* When > 0 there are consumers of rx skb time stamps */
1337 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1338
1339 void net_enable_timestamp(void)
1340 {
1341         atomic_inc(&netstamp_needed);
1342 }
1343 EXPORT_SYMBOL(net_enable_timestamp);
1344
1345 void net_disable_timestamp(void)
1346 {
1347         atomic_dec(&netstamp_needed);
1348 }
1349 EXPORT_SYMBOL(net_disable_timestamp);
1350
1351 static inline void net_timestamp(struct sk_buff *skb)
1352 {
1353         if (atomic_read(&netstamp_needed))
1354                 __net_timestamp(skb);
1355         else
1356                 skb->tstamp.tv64 = 0;
1357 }
1358
1359 /*
1360  *      Support routine. Sends outgoing frames to any network
1361  *      taps currently in use.
1362  */
1363
1364 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1365 {
1366         struct packet_type *ptype;
1367
1368 #ifdef CONFIG_NET_CLS_ACT
1369         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1370                 net_timestamp(skb);
1371 #else
1372         net_timestamp(skb);
1373 #endif
1374
1375         rcu_read_lock();
1376         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1377                 /* Never send packets back to the socket
1378                  * they originated from - MvS (miquels@drinkel.ow.org)
1379                  */
1380                 if ((ptype->dev == dev || !ptype->dev) &&
1381                     (ptype->af_packet_priv == NULL ||
1382                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1383                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1384                         if (!skb2)
1385                                 break;
1386
1387                         /* skb->nh should be correctly
1388                            set by sender, so that the second statement is
1389                            just protection against buggy protocols.
1390                          */
1391                         skb_reset_mac_header(skb2);
1392
1393                         if (skb_network_header(skb2) < skb2->data ||
1394                             skb2->network_header > skb2->tail) {
1395                                 if (net_ratelimit())
1396                                         printk(KERN_CRIT "protocol %04x is "
1397                                                "buggy, dev %s\n",
1398                                                skb2->protocol, dev->name);
1399                                 skb_reset_network_header(skb2);
1400                         }
1401
1402                         skb2->transport_header = skb2->network_header;
1403                         skb2->pkt_type = PACKET_OUTGOING;
1404                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1405                 }
1406         }
1407         rcu_read_unlock();
1408 }
1409
1410
1411 static inline void __netif_reschedule(struct Qdisc *q)
1412 {
1413         struct softnet_data *sd;
1414         unsigned long flags;
1415
1416         local_irq_save(flags);
1417         sd = &__get_cpu_var(softnet_data);
1418         q->next_sched = sd->output_queue;
1419         sd->output_queue = q;
1420         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1421         local_irq_restore(flags);
1422 }
1423
1424 void __netif_schedule(struct Qdisc *q)
1425 {
1426         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1427                 __netif_reschedule(q);
1428 }
1429 EXPORT_SYMBOL(__netif_schedule);
1430
1431 void dev_kfree_skb_irq(struct sk_buff *skb)
1432 {
1433         if (atomic_dec_and_test(&skb->users)) {
1434                 struct softnet_data *sd;
1435                 unsigned long flags;
1436
1437                 local_irq_save(flags);
1438                 sd = &__get_cpu_var(softnet_data);
1439                 skb->next = sd->completion_queue;
1440                 sd->completion_queue = skb;
1441                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1442                 local_irq_restore(flags);
1443         }
1444 }
1445 EXPORT_SYMBOL(dev_kfree_skb_irq);
1446
/*
 *      Release @skb from any context: dev_kfree_skb_irq() is used in hard
 *      interrupt context or with interrupts disabled, dev_kfree_skb()
 *      otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1455
1456
1457 /**
1458  * netif_device_detach - mark device as removed
1459  * @dev: network device
1460  *
1461  * Mark device as removed from system and therefore no longer available.
1462  */
1463 void netif_device_detach(struct net_device *dev)
1464 {
1465         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1466             netif_running(dev)) {
1467                 netif_tx_stop_all_queues(dev);
1468         }
1469 }
1470 EXPORT_SYMBOL(netif_device_detach);
1471
1472 /**
1473  * netif_device_attach - mark device as attached
1474  * @dev: network device
1475  *
1476  * Mark device as attached from system and restart if needed.
1477  */
1478 void netif_device_attach(struct net_device *dev)
1479 {
1480         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1481             netif_running(dev)) {
1482                 netif_tx_wake_all_queues(dev);
1483                 __netdev_watchdog_up(dev);
1484         }
1485 }
1486 EXPORT_SYMBOL(netif_device_attach);
1487
1488 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1489 {
1490         return ((features & NETIF_F_GEN_CSUM) ||
1491                 ((features & NETIF_F_IP_CSUM) &&
1492                  protocol == htons(ETH_P_IP)) ||
1493                 ((features & NETIF_F_IPV6_CSUM) &&
1494                  protocol == htons(ETH_P_IPV6)) ||
1495                 ((features & NETIF_F_FCOE_CRC) &&
1496                  protocol == htons(ETH_P_FCOE)));
1497 }
1498
1499 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1500 {
1501         if (can_checksum_protocol(dev->features, skb->protocol))
1502                 return true;
1503
1504         if (skb->protocol == htons(ETH_P_8021Q)) {
1505                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1506                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1507                                           veh->h_vlan_encapsulated_proto))
1508                         return true;
1509         }
1510
1511         return false;
1512 }
1513
1514 /*
1515  * Invalidate hardware checksum when packet is to be mangled, and
1516  * complete checksum manually on outgoing path.
1517  */
1518 int skb_checksum_help(struct sk_buff *skb)
1519 {
1520         __wsum csum;
1521         int ret = 0, offset;
1522
1523         if (skb->ip_summed == CHECKSUM_COMPLETE)
1524                 goto out_set_summed;
1525
1526         if (unlikely(skb_shinfo(skb)->gso_size)) {
1527                 /* Let GSO fix up the checksum. */
1528                 goto out_set_summed;
1529         }
1530
1531         offset = skb->csum_start - skb_headroom(skb);
1532         BUG_ON(offset >= skb_headlen(skb));
1533         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1534
1535         offset += skb->csum_offset;
1536         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1537
1538         if (skb_cloned(skb) &&
1539             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1540                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1541                 if (ret)
1542                         goto out;
1543         }
1544
1545         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1546 out_set_summed:
1547         skb->ip_summed = CHECKSUM_NONE;
1548 out:
1549         return ret;
1550 }
1551 EXPORT_SYMBOL(skb_checksum_help);
1552
1553 /**
1554  *      skb_gso_segment - Perform segmentation on skb.
1555  *      @skb: buffer to segment
1556  *      @features: features for the output path (see dev->features)
1557  *
1558  *      This function segments the given skb and returns a list of segments.
1559  *
1560  *      It may return NULL if the skb requires no segmentation.  This is
1561  *      only possible when GSO is used for verifying header integrity.
1562  */
1563 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1564 {
1565         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1566         struct packet_type *ptype;
1567         __be16 type = skb->protocol;
1568         int err;
1569
1570         skb_reset_mac_header(skb);
1571         skb->mac_len = skb->network_header - skb->mac_header;
1572         __skb_pull(skb, skb->mac_len);
1573
1574         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1575                 struct net_device *dev = skb->dev;
1576                 struct ethtool_drvinfo info = {};
1577
1578                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1579                         dev->ethtool_ops->get_drvinfo(dev, &info);
1580
1581                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1582                         "ip_summed=%d",
1583                      info.driver, dev ? dev->features : 0L,
1584                      skb->sk ? skb->sk->sk_route_caps : 0L,
1585                      skb->len, skb->data_len, skb->ip_summed);
1586
1587                 if (skb_header_cloned(skb) &&
1588                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1589                         return ERR_PTR(err);
1590         }
1591
1592         rcu_read_lock();
1593         list_for_each_entry_rcu(ptype,
1594                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1595                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1596                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1597                                 err = ptype->gso_send_check(skb);
1598                                 segs = ERR_PTR(err);
1599                                 if (err || skb_gso_ok(skb, features))
1600                                         break;
1601                                 __skb_push(skb, (skb->data -
1602                                                  skb_network_header(skb)));
1603                         }
1604                         segs = ptype->gso_segment(skb, features);
1605                         break;
1606                 }
1607         }
1608         rcu_read_unlock();
1609
1610         __skb_push(skb, skb->data - skb_mac_header(skb));
1611
1612         return segs;
1613 }
1614 EXPORT_SYMBOL(skb_gso_segment);
1615
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        if (!net_ratelimit())
                return;

        printk(KERN_ERR "%s: hw csum failure.\n",
                dev ? dev->name : "<unknown>");
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1628
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when @skb has a fragment in high memory and @dev lacks
 * NETIF_F_HIGHDMA, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int frag;

        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
                if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
                        return 1;
        }

#endif
        return 0;
}
1649
1650 struct dev_gso_cb {
1651         void (*destructor)(struct sk_buff *skb);
1652 };
1653
1654 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1655
1656 static void dev_gso_skb_destructor(struct sk_buff *skb)
1657 {
1658         struct dev_gso_cb *cb;
1659
1660         do {
1661                 struct sk_buff *nskb = skb->next;
1662
1663                 skb->next = nskb->next;
1664                 nskb->next = NULL;
1665                 kfree_skb(nskb);
1666         } while (skb->next);
1667
1668         cb = DEV_GSO_CB(skb);
1669         if (cb->destructor)
1670                 cb->destructor(skb);
1671 }
1672
1673 /**
1674  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1675  *      @skb: buffer to segment
1676  *
1677  *      This function segments the given skb and stores the list of segments
1678  *      in skb->next.
1679  */
1680 static int dev_gso_segment(struct sk_buff *skb)
1681 {
1682         struct net_device *dev = skb->dev;
1683         struct sk_buff *segs;
1684         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1685                                          NETIF_F_SG : 0);
1686
1687         segs = skb_gso_segment(skb, features);
1688
1689         /* Verifying header integrity only. */
1690         if (!segs)
1691                 return 0;
1692
1693         if (IS_ERR(segs))
1694                 return PTR_ERR(segs);
1695
1696         skb->next = segs;
1697         DEV_GSO_CB(skb)->destructor = skb->destructor;
1698         skb->destructor = dev_gso_skb_destructor;
1699
1700         return 0;
1701 }
1702
1703 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1704                         struct netdev_queue *txq)
1705 {
1706         const struct net_device_ops *ops = dev->netdev_ops;
1707         int rc;
1708
1709         if (likely(!skb->next)) {
1710                 if (!list_empty(&ptype_all)
1711 #if defined(CONFIG_IMQ) || defined(CONFIG_IMQ_MODULE)
1712                     && !(skb->imq_flags & IMQ_F_ENQUEUE)
1713 #endif
1714                     )
1715                         dev_queue_xmit_nit(skb, dev);
1716
1717                 if (netif_needs_gso(dev, skb)) {
1718                         if (unlikely(dev_gso_segment(skb)))
1719                                 goto out_kfree_skb;
1720                         if (skb->next)
1721                                 goto gso;
1722                 }
1723
1724                 /*
1725                  * If device doesnt need skb->dst, release it right now while
1726                  * its hot in this cpu cache
1727                  */
1728                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1729                         skb_dst_drop(skb);
1730
1731                 rc = ops->ndo_start_xmit(skb, dev);
1732                 if (rc == NETDEV_TX_OK)
1733                         txq_trans_update(txq);
1734                 /*
1735                  * TODO: if skb_orphan() was called by
1736                  * dev->hard_start_xmit() (for example, the unmodified
1737                  * igb driver does that; bnx2 doesn't), then
1738                  * skb_tx_software_timestamp() will be unable to send
1739                  * back the time stamp.
1740                  *
1741                  * How can this be prevented? Always create another
1742                  * reference to the socket before calling
1743                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1744                  * does anything in dev->hard_start_xmit() by clearing
1745                  * the skb destructor before the call and restoring it
1746                  * afterwards, then doing the skb_orphan() ourselves?
1747                  */
1748                 return rc;
1749         }
1750
1751 gso:
1752         do {
1753                 struct sk_buff *nskb = skb->next;
1754
1755                 skb->next = nskb->next;
1756                 nskb->next = NULL;
1757                 rc = ops->ndo_start_xmit(nskb, dev);
1758                 if (unlikely(rc != NETDEV_TX_OK)) {
1759                         nskb->next = skb->next;
1760                         skb->next = nskb;
1761                         return rc;
1762                 }
1763                 txq_trans_update(txq);
1764                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1765                         return NETDEV_TX_BUSY;
1766         } while (skb->next);
1767
1768         skb->destructor = DEV_GSO_CB(skb)->destructor;
1769
1770 out_kfree_skb:
1771         kfree_skb(skb);
1772         return NETDEV_TX_OK;
1773 }
1774
1775 static u32 skb_tx_hashrnd;
1776
1777 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1778 {
1779         u32 hash;
1780
1781         if (skb_rx_queue_recorded(skb)) {
1782                 hash = skb_get_rx_queue(skb);
1783                 while (unlikely(hash >= dev->real_num_tx_queues))
1784                         hash -= dev->real_num_tx_queues;
1785                 return hash;
1786         }
1787
1788         if (skb->sk && skb->sk->sk_hash)
1789                 hash = skb->sk->sk_hash;
1790         else
1791                 hash = skb->protocol;
1792
1793         hash = jhash_1word(hash, skb_tx_hashrnd);
1794
1795         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1796 }
1797 EXPORT_SYMBOL(skb_tx_hash);
1798
1799 struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb)
1800 {
1801         const struct net_device_ops *ops = dev->netdev_ops;
1802         u16 queue_index = 0;
1803
1804         if (ops->ndo_select_queue)
1805                 queue_index = ops->ndo_select_queue(dev, skb);
1806         else if (dev->real_num_tx_queues > 1)
1807                 queue_index = skb_tx_hash(dev, skb);
1808
1809         skb_set_queue_mapping(skb, queue_index);
1810         return netdev_get_tx_queue(dev, queue_index);
1811 }
1812 EXPORT_SYMBOL(dev_pick_tx);
1813
1814 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1815                                  struct net_device *dev,
1816                                  struct netdev_queue *txq)
1817 {
1818         spinlock_t *root_lock = qdisc_lock(q);
1819         int rc;
1820
1821         spin_lock(root_lock);
1822         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1823                 kfree_skb(skb);
1824                 rc = NET_XMIT_DROP;
1825         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1826                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1827                 /*
1828                  * This is a work-conserving queue; there are no old skbs
1829                  * waiting to be sent out; and the qdisc is not running -
1830                  * xmit the skb directly.
1831                  */
1832                 __qdisc_update_bstats(q, skb->len);
1833                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1834                         __qdisc_run(q);
1835                 else
1836                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
1837
1838                 rc = NET_XMIT_SUCCESS;
1839         } else {
1840                 rc = qdisc_enqueue_root(skb, q);
1841                 qdisc_run(q);
1842         }
1843         spin_unlock(root_lock);
1844
1845         return rc;
1846 }
1847
1848 /**
1849  *      dev_queue_xmit - transmit a buffer
1850  *      @skb: buffer to transmit
1851  *
1852  *      Queue a buffer for transmission to a network device. The caller must
1853  *      have set the device and priority and built the buffer before calling
1854  *      this function. The function can be called from an interrupt.
1855  *
1856  *      A negative errno code is returned on a failure. A success does not
1857  *      guarantee the frame will be transmitted as it may be dropped due
1858  *      to congestion or traffic shaping.
1859  *
1860  * -----------------------------------------------------------------------------------
1861  *      I notice this method can also return errors from the queue disciplines,
1862  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1863  *      be positive.
1864  *
1865  *      Regardless of the return value, the skb is consumed, so it is currently
1866  *      difficult to retry a send to this method.  (You can bump the ref count
1867  *      before sending to hold a reference for retry if you are careful.)
1868  *
1869  *      When calling this method, interrupts MUST be enabled.  This is because
1870  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1871  *          --BLG
1872  */
1873 int dev_queue_xmit(struct sk_buff *skb)
1874 {
1875         struct net_device *dev = skb->dev;
1876         struct netdev_queue *txq;
1877         struct Qdisc *q;
1878         int rc = -ENOMEM;
1879
1880         /* GSO will handle the following emulations directly. */
1881         if (netif_needs_gso(dev, skb))
1882                 goto gso;
1883
1884         if (skb_has_frags(skb) &&
1885             !(dev->features & NETIF_F_FRAGLIST) &&
1886             __skb_linearize(skb))
1887                 goto out_kfree_skb;
1888
1889         /* Fragmented skb is linearized if device does not support SG,
1890          * or if at least one of fragments is in highmem and device
1891          * does not support DMA from it.
1892          */
1893         if (skb_shinfo(skb)->nr_frags &&
1894             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1895             __skb_linearize(skb))
1896                 goto out_kfree_skb;
1897
1898         /* If packet is not checksummed and device does not support
1899          * checksumming for this protocol, complete checksumming here.
1900          */
1901         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1902                 skb_set_transport_header(skb, skb->csum_start -
1903                                               skb_headroom(skb));
1904                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1905                         goto out_kfree_skb;
1906         }
1907
1908 gso:
1909         /* Disable soft irqs for various locks below. Also
1910          * stops preemption for RCU.
1911          */
1912         rcu_read_lock_bh();
1913
1914         txq = dev_pick_tx(dev, skb);
1915         q = rcu_dereference(txq->qdisc);
1916
1917 #ifdef CONFIG_NET_CLS_ACT
1918         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1919 #endif
1920         if (q->enqueue) {
1921                 rc = __dev_xmit_skb(skb, q, dev, txq);
1922                 goto out;
1923         }
1924
1925         /* The device has no queue. Common case for software devices:
1926            loopback, all the sorts of tunnels...
1927
1928            Really, it is unlikely that netif_tx_lock protection is necessary
1929            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1930            counters.)
1931            However, it is possible, that they rely on protection
1932            made by us here.
1933
1934            Check this and shot the lock. It is not prone from deadlocks.
1935            Either shot noqueue qdisc, it is even simpler 8)
1936          */
1937         if (dev->flags & IFF_UP) {
1938                 int cpu = smp_processor_id(); /* ok because BHs are off */
1939
1940                 if (txq->xmit_lock_owner != cpu) {
1941
1942                         HARD_TX_LOCK(dev, txq, cpu);
1943
1944                         if (!netif_tx_queue_stopped(txq)) {
1945                                 rc = NET_XMIT_SUCCESS;
1946                                 if (!dev_hard_start_xmit(skb, dev, txq)) {
1947                                         HARD_TX_UNLOCK(dev, txq);
1948                                         goto out;
1949                                 }
1950                         }
1951                         HARD_TX_UNLOCK(dev, txq);
1952                         if (net_ratelimit())
1953                                 printk(KERN_CRIT "Virtual device %s asks to "
1954                                        "queue packet!\n", dev->name);
1955                 } else {
1956                         /* Recursion is detected! It is possible,
1957                          * unfortunately */
1958                         if (net_ratelimit())
1959                                 printk(KERN_CRIT "Dead loop on virtual device "
1960                                        "%s, fix it urgently!\n", dev->name);
1961                 }
1962         }
1963
1964         rc = -ENETDOWN;
1965         rcu_read_unlock_bh();
1966
1967 out_kfree_skb:
1968         kfree_skb(skb);
1969         return rc;
1970 out:
1971         rcu_read_unlock_bh();
1972         return rc;
1973 }
1974 EXPORT_SYMBOL(dev_queue_xmit);
1975
1976
1977 /*=======================================================================
1978                         Receiver routines
1979   =======================================================================*/
1980
1981 int netdev_max_backlog __read_mostly = 1000;
1982 int netdev_budget __read_mostly = 300;
1983 int weight_p __read_mostly = 64;            /* old backlog weight */
1984
1985 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1986
1987
1988 /**
1989  *      netif_rx        -       post buffer to the network code
1990  *      @skb: buffer to post
1991  *
1992  *      This function receives a packet from a device driver and queues it for
1993  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1994  *      may be dropped during processing for congestion control or by the
1995  *      protocol layers.
1996  *
1997  *      return values:
1998  *      NET_RX_SUCCESS  (no congestion)
1999  *      NET_RX_DROP     (packet was dropped)
2000  *
2001  */
2002
2003 int netif_rx(struct sk_buff *skb)
2004 {
2005         struct softnet_data *queue;
2006         unsigned long flags;
2007
2008         /* if netpoll wants it, pretend we never saw it */
2009         if (netpoll_rx(skb))
2010                 return NET_RX_DROP;
2011
2012         if (!skb->tstamp.tv64)
2013                 net_timestamp(skb);
2014
2015         /*
2016          * The code is rearranged so that the path is the most
2017          * short when CPU is congested, but is still operating.
2018          */
2019         local_irq_save(flags);
2020         queue = &__get_cpu_var(softnet_data);
2021
2022         __get_cpu_var(netdev_rx_stat).total++;
2023         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2024                 if (queue->input_pkt_queue.qlen) {
2025 enqueue:
2026                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2027                         local_irq_restore(flags);
2028                         return NET_RX_SUCCESS;
2029                 }
2030
2031                 napi_schedule(&queue->backlog);
2032                 goto enqueue;
2033         }
2034
2035         __get_cpu_var(netdev_rx_stat).dropped++;
2036         local_irq_restore(flags);
2037
2038         kfree_skb(skb);
2039         return NET_RX_DROP;
2040 }
2041 EXPORT_SYMBOL(netif_rx);
2042
/*
 * netif_rx_ni - netif_rx() followed by a softirq run
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and, if a softirq is pending
 * afterwards, runs do_softirq() before re-enabling preemption.  Returns
 * the value netif_rx() produced.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2056
2057 static void net_tx_action(struct softirq_action *h)
2058 {
2059         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2060
2061         if (sd->completion_queue) {
2062                 struct sk_buff *clist;
2063
2064                 local_irq_disable();
2065                 clist = sd->completion_queue;
2066                 sd->completion_queue = NULL;
2067                 local_irq_enable();
2068
2069                 while (clist) {
2070                         struct sk_buff *skb = clist;
2071                         clist = clist->next;
2072
2073                         WARN_ON(atomic_read(&skb->users));
2074                         __kfree_skb(skb);
2075                 }
2076         }
2077
2078         if (sd->output_queue) {
2079                 struct Qdisc *head;
2080
2081                 local_irq_disable();
2082                 head = sd->output_queue;
2083                 sd->output_queue = NULL;
2084                 local_irq_enable();
2085
2086                 while (head) {
2087                         struct Qdisc *q = head;
2088                         spinlock_t *root_lock;
2089
2090                         head = head->next_sched;
2091
2092                         root_lock = qdisc_lock(q);
2093                         if (spin_trylock(root_lock)) {
2094                                 smp_mb__before_clear_bit();
2095                                 clear_bit(__QDISC_STATE_SCHED,
2096                                           &q->state);
2097                                 qdisc_run(q);
2098                                 spin_unlock(root_lock);
2099                         } else {
2100                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2101                                               &q->state)) {
2102                                         __netif_reschedule(q);
2103                                 } else {
2104                                         smp_mb__before_clear_bit();
2105                                         clear_bit(__QDISC_STATE_SCHED,
2106                                                   &q->state);
2107                                 }
2108                         }
2109                 }
2110         }
2111 }
2112
2113 static inline int deliver_skb(struct sk_buff *skb,
2114                               struct packet_type *pt_prev,
2115                               struct net_device *orig_dev)
2116 {
2117         atomic_inc(&skb->users);
2118         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2119 }
2120
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/*
 * If bridge module is loaded call bridging hook.
 *  returns NULL if packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(br_handle_frame_hook);

/*
 * Pass @skb to the bridge hook when its device has a bridge port attached.
 * Loopback packets and devices without a port are returned untouched.  A
 * pending *pt_prev handler is delivered to (and cleared) before the hook
 * runs; the hook's return value is passed back to the caller.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (port == NULL)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2158
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
/* Receive hook called from handle_macvlan(); declared here, set elsewhere. */
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Pass @skb to the macvlan hook when its device has a macvlan port;
 * otherwise return it untouched.  A pending *pt_prev handler is delivered
 * to (and cleared) before the hook runs.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev != NULL) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2180
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe sch_ingress should simply be forced to be compiled in
 * whenever CONFIG_NET_CLS_ACT is set.  Without it, this path costs a
 * compare and two stores that achieve nothing.
 * NOTE: No functionality is lost either way; without the ingress
 * scheduler you just cannot add policies on ingress.
 */

/*
 * Run @skb through the ingress qdisc of its device.
 *
 * The redirect TTL kept in tc_verd is bumped on every pass; once it has
 * exceeded MAX_RED_LOOP the packet is reported as a redirect loop and
 * TC_ACT_SHOT is returned.  Otherwise the result of qdisc_enqueue_root()
 * is returned, or TC_ACT_OK when the qdisc is noop_qdisc or deactivated.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	u32 ttl = G_TC_RTTL(skb->tc_verd);
	int result = TC_ACT_OK;
	struct Qdisc *q;

	if (ttl > MAX_RED_LOOP) {
		printk(KERN_WARNING
		       "Redir loop detected Dropping packet (%d->%d)\n",
		       skb->iif, dev->ifindex);
		return TC_ACT_SHOT;
	}
	ttl++;

	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);

	q = dev->rx_queue.qdisc;
	if (q != &noop_qdisc) {
		spinlock_t *lock = qdisc_lock(q);

		spin_lock(lock);
		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
			result = qdisc_enqueue_root(skb, q);
		spin_unlock(lock);
	}

	return result;
}

/*
 * Ingress classification step of the receive path.
 *
 * Does nothing but clear tc_verd when no ingress qdisc is attached.
 * Otherwise a pending *pt_prev handler is delivered to first, the skb is
 * filtered, and on a SHOT/STOLEN verdict it is freed and NULL is returned.
 */
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
	if (skb->dev->rx_queue.qdisc != &noop_qdisc) {
		int verdict;

		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		} else {
			/* Huh? Why does turning on AF_PACKET affect this? */
			skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
		}

		verdict = ing_filter(skb);
		if (verdict == TC_ACT_SHOT || verdict == TC_ACT_STOLEN) {
			kfree_skb(skb);
			return NULL;
		}
	}

	skb->tc_verd = 0;
	return skb;
}
#endif
2248
2249 /*
2250  *      netif_nit_deliver - deliver received packets to network taps
2251  *      @skb: buffer
2252  *
2253  *      This function is used to deliver incoming packets to network
2254  *      taps. It should be used when the normal netif_receive_skb path
2255  *      is bypassed, for example because of VLAN acceleration.
2256  */
2257 void netif_nit_deliver(struct sk_buff *skb)
2258 {
2259         struct packet_type *ptype;
2260
2261         if (list_empty(&ptype_all))
2262                 return;
2263
2264         skb_reset_network_header(skb);
2265         skb_reset_transport_header(skb);
2266         skb->mac_len = skb->network_header - skb->mac_header;
2267
2268         rcu_read_lock();
2269         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2270                 if (!ptype->dev || ptype->dev == skb->dev)
2271                         deliver_skb(skb, ptype, skb->dev);
2272         }
2273         rcu_read_unlock();
2274 }
2275
2276 /**
2277  *      netif_receive_skb - process receive buffer from network
2278  *      @skb: buffer to process
2279  *
2280  *      netif_receive_skb() is the main receive data processing function.
2281  *      It always succeeds. The buffer may be dropped during processing
2282  *      for congestion control or by the protocol layers.
2283  *
2284  *      This function may only be called from softirq context and interrupts
2285  *      should be enabled.
2286  *
2287  *      Return values (usually ignored):
2288  *      NET_RX_SUCCESS: no congestion
2289  *      NET_RX_DROP: packet was dropped
2290  */
2291 int netif_receive_skb(struct sk_buff *skb)
2292 {
2293         struct packet_type *ptype, *pt_prev;
2294         struct net_device *orig_dev;
2295         struct net_device *null_or_orig;
2296         int ret = NET_RX_DROP;
2297         __be16 type;
2298
2299         if (!skb->tstamp.tv64)
2300                 net_timestamp(skb);
2301
2302         if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2303                 return NET_RX_SUCCESS;
2304
2305         /* if we've gotten here through NAPI, check netpoll */
2306         if (netpoll_receive_skb(skb))
2307                 return NET_RX_DROP;
2308
2309         if (!skb->iif)
2310                 skb->iif = skb->dev->ifindex;
2311
2312         null_or_orig = NULL;
2313         orig_dev = skb->dev;
2314         if (orig_dev->master) {
2315                 if (skb_bond_should_drop(skb))
2316                         null_or_orig = orig_dev; /* deliver only exact match */
2317                 else
2318                         skb->dev = orig_dev->master;
2319         }
2320
2321         __get_cpu_var(netdev_rx_stat).total++;
2322
2323         skb_reset_network_header(skb);
2324         skb_reset_transport_header(skb);
2325         skb->mac_len = skb->network_header - skb->mac_header;
2326
2327         pt_prev = NULL;
2328
2329         rcu_read_lock();
2330
2331 #ifdef CONFIG_NET_CLS_ACT
2332         if (skb->tc_verd & TC_NCLS) {
2333                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2334                 goto ncls;
2335         }
2336 #endif
2337
2338         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2339                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2340                     ptype->dev == orig_dev) {
2341                         if (pt_prev)
2342                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2343                         pt_prev = ptype;
2344                 }
2345         }
2346
2347 #ifdef CONFIG_NET_CLS_ACT
2348         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2349         if (!skb)
2350                 goto out;
2351 ncls:
2352 #endif
2353
2354         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2355         if (!skb)
2356                 goto out;
2357         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2358         if (!skb)
2359                 goto out;
2360
2361         type = skb->protocol;
2362         list_for_each_entry_rcu(ptype,
2363                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2364                 if (ptype->type == type &&
2365                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2366                      ptype->dev == orig_dev)) {
2367                         if (pt_prev)
2368                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2369                         pt_prev = ptype;
2370                 }
2371         }
2372
2373         if (pt_prev) {
2374                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2375         } else {
2376                 kfree_skb(skb);
2377                 /* Jamal, now you will not able to escape explaining
2378                  * me how you were going to use this. :-)
2379                  */
2380                 ret = NET_RX_DROP;
2381         }
2382
2383 out:
2384         rcu_read_unlock();
2385         return ret;
2386 }
2387 EXPORT_SYMBOL(netif_receive_skb);
2388
2389 /* Network device is going away, flush any packets still pending  */
2390 static void flush_backlog(void *arg)
2391 {
2392         struct net_device *dev = arg;
2393         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2394         struct sk_buff *skb, *tmp;
2395
2396         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2397                 if (skb->dev == dev) {
2398                         __skb_unlink(skb, &queue->input_pkt_queue);
2399                         kfree_skb(skb);
2400                 }
2401 }
2402
2403 static int napi_gro_complete(struct sk_buff *skb)
2404 {
2405         struct packet_type *ptype;
2406         __be16 type = skb->protocol;
2407         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2408         int err = -ENOENT;
2409
2410         if (NAPI_GRO_CB(skb)->count == 1) {
2411                 skb_shinfo(skb)->gso_size = 0;
2412                 goto out;
2413         }
2414
2415         rcu_read_lock();
2416         list_for_each_entry_rcu(ptype, head, list) {
2417                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2418                         continue;
2419
2420                 err = ptype->gro_complete(skb);
2421                 break;
2422         }
2423         rcu_read_unlock();
2424
2425         if (err) {
2426                 WARN_ON(&ptype->list == head);
2427                 kfree_skb(skb);
2428                 return NET_RX_SUCCESS;
2429         }
2430
2431 out:
2432         return netif_receive_skb(skb);
2433 }
2434
2435 void napi_gro_flush(struct napi_struct *napi)
2436 {
2437         struct sk_buff *skb, *next;
2438
2439         for (skb = napi->gro_list; skb; skb = next) {
2440                 next = skb->next;
2441                 skb->next = NULL;
2442                 napi_gro_complete(skb);
2443         }
2444
2445         napi->gro_count = 0;
2446         napi->gro_list = NULL;
2447 }
2448 EXPORT_SYMBOL(napi_gro_flush);
2449
2450 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2451 {
2452         struct sk_buff **pp = NULL;
2453         struct packet_type *ptype;
2454         __be16 type = skb->protocol;
2455         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2456         int same_flow;
2457         int mac_len;
2458         int ret;
2459
2460         if (!(skb->dev->features & NETIF_F_GRO))
2461                 goto normal;
2462
2463         if (skb_is_gso(skb) || skb_has_frags(skb))
2464                 goto normal;
2465
2466         rcu_read_lock();
2467         list_for_each_entry_rcu(ptype, head, list) {
2468                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2469                         continue;
2470
2471                 skb_set_network_header(skb, skb_gro_offset(skb));
2472                 mac_len = skb->network_header - skb->mac_header;
2473                 skb->mac_len = mac_len;
2474                 NAPI_GRO_CB(skb)->same_flow = 0;
2475                 NAPI_GRO_CB(skb)->flush = 0;
2476                 NAPI_GRO_CB(skb)->free = 0;
2477
2478                 pp = ptype->gro_receive(&napi->gro_list, skb);
2479                 break;
2480         }
2481         rcu_read_unlock();
2482
2483         if (&ptype->list == head)
2484                 goto normal;
2485
2486         same_flow = NAPI_GRO_CB(skb)->same_flow;
2487         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2488
2489         if (pp) {
2490                 struct sk_buff *nskb = *pp;
2491
2492                 *pp = nskb->next;
2493                 nskb->next = NULL;
2494                 napi_gro_complete(nskb);
2495                 napi->gro_count--;
2496         }
2497
2498         if (same_flow)
2499                 goto ok;
2500
2501         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2502                 goto normal;
2503
2504         napi->gro_count++;
2505         NAPI_GRO_CB(skb)->count = 1;
2506         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2507         skb->next = napi->gro_list;
2508         napi->gro_list = skb;
2509         ret = GRO_HELD;
2510
2511 pull:
2512         if (skb_headlen(skb) < skb_gro_offset(skb)) {
2513                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2514
2515                 BUG_ON(skb->end - skb->tail < grow);
2516
2517                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2518
2519                 skb->tail += grow;
2520                 skb->data_len -= grow;
2521
2522                 skb_shinfo(skb)->frags[0].page_offset += grow;
2523                 skb_shinfo(skb)->frags[0].size -= grow;
2524
2525                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2526                         put_page(skb_shinfo(skb)->frags[0].page);
2527                         memmove(skb_shinfo(skb)->frags,
2528                                 skb_shinfo(skb)->frags + 1,
2529                                 --skb_shinfo(skb)->nr_frags);
2530                 }
2531         }
2532
2533 ok:
2534         return ret;
2535
2536 normal:
2537         ret = GRO_NORMAL;
2538         goto pull;
2539 }
2540 EXPORT_SYMBOL(dev_gro_receive);
2541
2542 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2543 {
2544         struct sk_buff *p;
2545
2546         if (netpoll_rx_on(skb))
2547                 return GRO_NORMAL;
2548
2549         for (p = napi->gro_list; p; p = p->next) {
2550                 NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev)
2551                         && !compare_ether_header(skb_mac_header(p),
2552                                                  skb_gro_mac_header(skb));
2553                 NAPI_GRO_CB(p)->flush = 0;
2554         }
2555
2556         return dev_gro_receive(napi, skb);
2557 }
2558
2559 int napi_skb_finish(int ret, struct sk_buff *skb)
2560 {
2561         int err = NET_RX_SUCCESS;
2562
2563         switch (ret) {
2564         case GRO_NORMAL:
2565                 return netif_receive_skb(skb);
2566
2567         case GRO_DROP:
2568                 err = NET_RX_DROP;
2569                 /* fall through */
2570
2571         case GRO_MERGED_FREE:
2572                 kfree_skb(skb);
2573                 break;
2574         }
2575
2576         return err;
2577 }
2578 EXPORT_SYMBOL(napi_skb_finish);
2579
2580 void skb_gro_reset_offset(struct sk_buff *skb)
2581 {
2582         NAPI_GRO_CB(skb)->data_offset = 0;
2583         NAPI_GRO_CB(skb)->frag0 = NULL;
2584         NAPI_GRO_CB(skb)->frag0_len = 0;
2585
2586         if (skb->mac_header == skb->tail &&
2587             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2588                 NAPI_GRO_CB(skb)->frag0 =
2589                         page_address(skb_shinfo(skb)->frags[0].page) +
2590                         skb_shinfo(skb)->frags[0].page_offset;
2591                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2592         }
2593 }
2594 EXPORT_SYMBOL(skb_gro_reset_offset);
2595
/*
 * GRO entry point for a complete skb: reset its GRO offsets, run it
 * through __napi_gro_receive() and finish it with napi_skb_finish().
 * Returns whatever napi_skb_finish() returns.
 */
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        int verdict;

        skb_gro_reset_offset(skb);
        verdict = __napi_gro_receive(napi, skb);

        return napi_skb_finish(verdict, skb);
}
2602 EXPORT_SYMBOL(napi_gro_receive);
2603
2604 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2605 {
2606         __skb_pull(skb, skb_headlen(skb));
2607         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2608
2609         napi->skb = skb;
2610 }
2611 EXPORT_SYMBOL(napi_reuse_skb);
2612
2613 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2614 {
2615         struct net_device *dev = napi->dev;
2616         struct sk_buff *skb = napi->skb;
2617
2618         if (!skb) {
2619                 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2620                 if (!skb)
2621                         goto out;
2622
2623                 skb_reserve(skb, NET_IP_ALIGN);
2624
2625                 napi->skb = skb;
2626         }
2627
2628 out:
2629         return skb;
2630 }
2631 EXPORT_SYMBOL(napi_get_frags);
2632
2633 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2634 {
2635         int err = NET_RX_SUCCESS;
2636
2637         switch (ret) {
2638         case GRO_NORMAL:
2639         case GRO_HELD:
2640                 skb->protocol = eth_type_trans(skb, skb->dev);
2641
2642                 if (ret == GRO_NORMAL)
2643                         return netif_receive_skb(skb);
2644
2645                 skb_gro_pull(skb, -ETH_HLEN);
2646                 break;
2647
2648         case GRO_DROP:
2649                 err = NET_RX_DROP;
2650                 /* fall through */
2651
2652         case GRO_MERGED_FREE:
2653                 napi_reuse_skb(napi, skb);
2654                 break;
2655         }
2656
2657         return err;
2658 }
2659 EXPORT_SYMBOL(napi_frags_finish);
2660
2661 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2662 {
2663         struct sk_buff *skb = napi->skb;
2664         struct ethhdr *eth;
2665         unsigned int hlen;
2666         unsigned int off;
2667
2668         napi->skb = NULL;
2669
2670         skb_reset_mac_header(skb);
2671         skb_gro_reset_offset(skb);
2672
2673         off = skb_gro_offset(skb);
2674         hlen = off + sizeof(*eth);
2675         eth = skb_gro_header_fast(skb, off);
2676         if (skb_gro_header_hard(skb, hlen)) {
2677                 eth = skb_gro_header_slow(skb, hlen, off);
2678                 if (unlikely(!eth)) {
2679                         napi_reuse_skb(napi, skb);
2680                         skb = NULL;
2681                         goto out;
2682                 }
2683         }
2684
2685         skb_gro_pull(skb, sizeof(*eth));
2686
2687         /*
2688          * This works because the only protocols we care about don't require
2689          * special handling.  We'll fix it up properly at the end.
2690          */
2691         skb->protocol = eth->h_proto;
2692
2693 out:
2694         return skb;
2695 }
2696 EXPORT_SYMBOL(napi_frags_skb);
2697
2698 int napi_gro_frags(struct napi_struct *napi)
2699 {
2700         struct sk_buff *skb = napi_frags_skb(napi);
2701
2702         if (!skb)
2703                 return NET_RX_DROP;
2704
2705         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2706 }
2707 EXPORT_SYMBOL(napi_gro_frags);
2708
2709 static int process_backlog(struct napi_struct *napi, int quota)
2710 {
2711         int work = 0;
2712         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2713         unsigned long start_time = jiffies;
2714
2715         napi->weight = weight_p;
2716         do {
2717                 struct sk_buff *skb;
2718
2719                 local_irq_disable();
2720                 skb = __skb_dequeue(&queue->input_pkt_queue);
2721                 if (!skb) {
2722                         __napi_complete(napi);
2723                         local_irq_enable();
2724                         break;
2725                 }
2726                 local_irq_enable();
2727
2728                 netif_receive_skb(skb);
2729         } while (++work < quota && jiffies == start_time);
2730
2731         return work;
2732 }
2733
2734 /**
2735  * __napi_schedule - schedule for receive
2736  * @n: entry to schedule
2737  *
2738  * The entry's receive function will be scheduled to run
2739  */
2740 void __napi_schedule(struct napi_struct *n)
2741 {
2742         unsigned long flags;
2743
2744         local_irq_save(flags);
2745         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2746         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2747         local_irq_restore(flags);
2748 }
2749 EXPORT_SYMBOL(__napi_schedule);
2750
2751 void __napi_complete(struct napi_struct *n)
2752 {
2753         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2754         BUG_ON(n->gro_list);
2755
2756         list_del(&n->poll_list);
2757         smp_mb__before_clear_bit();
2758         clear_bit(NAPI_STATE_SCHED, &n->state);
2759 }
2760 EXPORT_SYMBOL(__napi_complete);
2761
2762 void napi_complete(struct napi_struct *n)
2763 {
2764         unsigned long flags;
2765
2766         /*
2767          * don't let napi dequeue from the cpu poll list
2768          * just in case its running on a different cpu
2769          */
2770         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2771                 return;
2772
2773         napi_gro_flush(n);
2774         local_irq_save(flags);
2775         __napi_complete(n);
2776         local_irq_restore(flags);
2777 }
2778 EXPORT_SYMBOL(napi_complete);
2779
2780 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2781                     int (*poll)(struct napi_struct *, int), int weight)
2782 {
2783         INIT_LIST_HEAD(&napi->poll_list);
2784         napi->gro_count = 0;
2785         napi->gro_list = NULL;
2786         napi->skb = NULL;
2787         napi->poll = poll;
2788         napi->weight = weight;
2789         list_add(&napi->dev_list, &dev->napi_list);
2790         napi->dev = dev;
2791 #ifdef CONFIG_NETPOLL
2792         spin_lock_init(&napi->poll_lock);
2793         napi->poll_owner = -1;
2794 #endif
2795         set_bit(NAPI_STATE_SCHED, &napi->state);
2796 }
2797 EXPORT_SYMBOL(netif_napi_add);
2798
2799 void netif_napi_del(struct napi_struct *napi)
2800 {
2801         struct sk_buff *skb, *next;
2802
2803         list_del_init(&napi->dev_list);
2804         napi_free_frags(napi);
2805
2806         for (skb = napi->gro_list; skb; skb = next) {
2807                 next = skb->next;
2808                 skb->next = NULL;
2809                 kfree_skb(skb);
2810         }
2811
2812         napi->gro_list = NULL;
2813         napi->gro_count = 0;
2814 }
2815 EXPORT_SYMBOL(netif_napi_del);
2816
2817
2818 static void net_rx_action(struct softirq_action *h)
2819 {
2820         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2821         unsigned long time_limit = jiffies + 2;
2822         int budget = netdev_budget;
2823         void *have;
2824
2825         local_irq_disable();
2826
2827         while (!list_empty(list)) {
2828                 struct napi_struct *n;
2829                 int work, weight;
2830
2831                 /* If softirq window is exhuasted then punt.
2832                  * Allow this to run for 2 jiffies since which will allow
2833                  * an average latency of 1.5/HZ.
2834                  */
2835                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2836                         goto softnet_break;
2837
2838                 local_irq_enable();
2839
2840                 /* Even though interrupts have been re-enabled, this
2841                  * access is safe because interrupts can only add new
2842                  * entries to the tail of this list, and only ->poll()
2843                  * calls can remove this head entry from the list.
2844                  */
2845                 n = list_entry(list->next, struct napi_struct, poll_list);
2846
2847                 have = netpoll_poll_lock(n);
2848
2849                 weight = n->weight;
2850
2851                 /* This NAPI_STATE_SCHED test is for avoiding a race
2852                  * with netpoll's poll_napi().  Only the entity which
2853                  * obtains the lock and sees NAPI_STATE_SCHED set will
2854                  * actually make the ->poll() call.  Therefore we avoid
2855                  * accidently calling ->poll() when NAPI is not scheduled.
2856                  */
2857                 work = 0;
2858                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
2859                         work = n->poll(n, weight);
2860                         trace_napi_poll(n);
2861                 }
2862
2863                 WARN_ON_ONCE(work > weight);
2864
2865                 budget -= work;
2866
2867                 local_irq_disable();
2868
2869                 /* Drivers must not modify the NAPI state if they
2870                  * consume the entire weight.  In such cases this code
2871                  * still "owns" the NAPI instance and therefore can
2872                  * move the instance around on the list at-will.
2873                  */
2874                 if (unlikely(work == weight)) {
2875                         if (unlikely(napi_disable_pending(n))) {
2876                                 local_irq_enable();
2877                                 napi_complete(n);
2878                                 local_irq_disable();
2879                         } else
2880                                 list_move_tail(&n->poll_list, list);
2881                 }
2882
2883                 netpoll_poll_unlock(have);
2884         }
2885 out:
2886         local_irq_enable();
2887
2888 #ifdef CONFIG_NET_DMA
2889         /*
2890          * There may not be any more sk_buffs coming right now, so push
2891          * any pending DMA copies to hardware
2892          */
2893         dma_issue_pending_all();
2894 #endif
2895
2896         return;
2897
2898 softnet_break:
2899         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2900         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2901         goto out;
2902 }
2903
2904 static gifconf_func_t *gifconf_list[NPROTO];
2905
2906 /**
2907  *      register_gifconf        -       register a SIOCGIF handler
2908  *      @family: Address family
2909  *      @gifconf: Function handler
2910  *
2911  *      Register protocol dependent address dumping routines. The handler
2912  *      that is passed must not be freed or reused until it has been replaced
2913  *      by another handler.
2914  */
2915 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
2916 {
2917         if (family >= NPROTO)
2918                 return -EINVAL;
2919         gifconf_list[family] = gifconf;
2920         return 0;
2921 }
2922 EXPORT_SYMBOL(register_gifconf);
2923
2924
2925 /*
2926  *      Map an interface index to its name (SIOCGIFNAME)
2927  */
2928
2929 /*
2930  *      We need this ioctl for efficient implementation of the
2931  *      if_indextoname() function required by the IPv6 API.  Without
2932  *      it, we would have to search all the interfaces to find a
2933  *      match.  --pb
2934  */
2935
2936 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2937 {
2938         struct net_device *dev;
2939         struct ifreq ifr;
2940
2941         /*
2942          *      Fetch the caller's info block.
2943          */
2944
2945         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2946                 return -EFAULT;
2947
2948         read_lock(&dev_base_lock);
2949         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2950         if (!dev) {
2951                 read_unlock(&dev_base_lock);
2952                 return -ENODEV;
2953         }
2954
2955         strcpy(ifr.ifr_name, dev->name);
2956         read_unlock(&dev_base_lock);
2957
2958         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2959                 return -EFAULT;
2960         return 0;
2961 }
2962
2963 /*
2964  *      Perform a SIOCGIFCONF call. This structure will change
2965  *      size eventually, and there is nothing I can do about it.
2966  *      Thus we will need a 'compatibility mode'.
2967  */
2968
2969 static int dev_ifconf(struct net *net, char __user *arg)
2970 {
2971         struct ifconf ifc;
2972         struct net_device *dev;
2973         char __user *pos;
2974         int len;
2975         int total;
2976         int i;
2977
2978         /*
2979          *      Fetch the caller's info block.
2980          */
2981
2982         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2983                 return -EFAULT;
2984
2985         pos = ifc.ifc_buf;
2986         len = ifc.ifc_len;
2987
2988         /*
2989          *      Loop over the interfaces, and write an info block for each.
2990          */
2991
2992         total = 0;
2993         for_each_netdev(net, dev) {
2994                 for (i = 0; i < NPROTO; i++) {
2995                         if (gifconf_list[i]) {
2996                                 int done;
2997                                 if (!pos)
2998                                         done = gifconf_list[i](dev, NULL, 0);
2999                                 else
3000                                         done = gifconf_list[i](dev, pos + total,
3001                                                                len - total);
3002                                 if (done < 0)
3003                                         return -EFAULT;
3004                                 total += done;
3005                         }
3006                 }
3007         }
3008
3009         /*
3010          *      All done.  Write the updated control block back to the caller.
3011          */
3012         ifc.ifc_len = total;
3013
3014         /*
3015          *      Both BSD and Solaris return 0 here, so we do too.
3016          */
3017         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3018 }
3019
3020 #ifdef CONFIG_PROC_FS
3021 /*
3022  *      This is invoked by the /proc filesystem handler to display a device
3023  *      in detail.
3024  */
3025 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3026         __acquires(dev_base_lock)
3027 {
3028         struct net *net = seq_file_net(seq);
3029         loff_t off;
3030         struct net_device *dev;
3031
3032         read_lock(&dev_base_lock);
3033         if (!*pos)
3034                 return SEQ_START_TOKEN;
3035
3036         off = 1;
3037         for_each_netdev(net, dev)
3038                 if (off++ == *pos)
3039                         return dev;
3040
3041         return NULL;
3042 }
3043
3044 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3045 {
3046         struct net *net = seq_file_net(seq);
3047         ++*pos;
3048         return v == SEQ_START_TOKEN ?
3049                 first_net_device(net) : next_net_device((struct net_device *)v);
3050 }
3051
3052 void dev_seq_stop(struct seq_file *seq, void *v)
3053         __releases(dev_base_lock)
3054 {
3055         read_unlock(&dev_base_lock);
3056 }
3057
3058 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3059 {
3060         const struct net_device_stats *stats = dev_get_stats(dev);
3061
3062         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3063                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3064                    dev->name, stats->rx_bytes, stats->rx_packets,
3065                    stats->rx_errors,
3066                    stats->rx_dropped + stats->rx_missed_errors,
3067                    stats->rx_fifo_errors,
3068                    stats->rx_length_errors + stats->rx_over_errors +
3069                     stats->rx_crc_errors + stats->rx_frame_errors,
3070                    stats->rx_compressed, stats->multicast,
3071                    stats->tx_bytes, stats->tx_packets,
3072                    stats->tx_errors, stats->tx_dropped,
3073                    stats->tx_fifo_errors, stats->collisions,
3074                    stats->tx_carrier_errors +
3075                     stats->tx_aborted_errors +
3076                     stats->tx_window_errors +
3077                     stats->tx_heartbeat_errors,
3078                    stats->tx_compressed);
3079 }
3080
3081 /*
3082  *      Called from the PROCfs module. This now uses the new arbitrary sized
3083  *      /proc/net interface to create /proc/net/dev
3084  */
3085 static int dev_seq_show(struct seq_file *seq, void *v)
3086 {
3087         if (v == SEQ_START_TOKEN)
3088                 seq_puts(seq, "Inter-|   Receive                            "
3089                               "                    |  Transmit\n"
3090                               " face |bytes    packets errs drop fifo frame "
3091                               "compressed multicast|bytes    packets errs "
3092                               "drop fifo colls carrier compressed\n");
3093         else
3094                 dev_seq_printf_stats(seq, v);
3095         return 0;
3096 }
3097
3098 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3099 {
3100         struct netif_rx_stats *rc = NULL;
3101
3102         while (*pos < nr_cpu_ids)
3103                 if (cpu_online(*pos)) {
3104                         rc = &per_cpu(netdev_rx_stat, *pos);
3105                         break;
3106                 } else
3107                         ++*pos;
3108         return rc;
3109 }
3110
3111 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3112 {
3113         return softnet_get_online(pos);
3114 }
3115
3116 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3117 {
3118         ++*pos;
3119         return softnet_get_online(pos);
3120 }
3121
/* seq_file ->stop for softnet_stat: nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3125
3126 static int softnet_seq_show(struct seq_file *seq, void *v)
3127 {
3128         struct netif_rx_stats *s = v;
3129
3130         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3131                    s->total, s->dropped, s->time_squeeze, 0,
3132                    0, 0, 0, 0, /* was fastroute */
3133                    s->cpu_collision);
3134         return 0;
3135 }
3136
3137 static const struct seq_operations dev_seq_ops = {
3138         .start = dev_seq_start,
3139         .next  = dev_seq_next,
3140         .stop  = dev_seq_stop,
3141         .show  = dev_seq_show,
3142 };
3143
3144 static int dev_seq_open(struct inode *inode, struct file *file)
3145 {
3146         return seq_open_net(inode, file, &dev_seq_ops,
3147                             sizeof(struct seq_net_private));
3148 }
3149
3150 static const struct file_operations dev_seq_fops = {
3151         .owner   = THIS_MODULE,
3152         .open    = dev_seq_open,
3153         .read    = seq_read,
3154         .llseek  = seq_lseek,
3155         .release = seq_release_net,
3156 };
3157
3158 static const struct seq_operations softnet_seq_ops = {
3159         .start = softnet_seq_start,
3160         .next  = softnet_seq_next,
3161         .stop  = softnet_seq_stop,
3162         .show  = softnet_seq_show,
3163 };
3164
3165 static int softnet_seq_open(struct inode *inode, struct file *file)
3166 {
3167         return seq_open(file, &softnet_seq_ops);
3168 }
3169
3170 static const struct file_operations softnet_seq_fops = {
3171         .owner   = THIS_MODULE,
3172         .open    = softnet_seq_open,
3173         .read    = seq_read,
3174         .llseek  = seq_lseek,
3175         .release = seq_release,
3176 };
3177
3178 static void *ptype_get_idx(loff_t pos)
3179 {
3180         struct packet_type *pt = NULL;
3181         loff_t i = 0;
3182         int t;
3183
3184         list_for_each_entry_rcu(pt, &ptype_all, list) {
3185                 if (i == pos)
3186                         return pt;
3187                 ++i;
3188         }
3189
3190         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3191                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3192                         if (i == pos)
3193                                 return pt;
3194                         ++i;
3195                 }
3196         }
3197         return NULL;
3198 }
3199
3200 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3201         __acquires(RCU)
3202 {
3203         rcu_read_lock();
3204         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3205 }
3206
3207 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3208 {
3209         struct packet_type *pt;
3210         struct list_head *nxt;
3211         int hash;
3212
3213         ++*pos;
3214         if (v == SEQ_START_TOKEN)
3215                 return ptype_get_idx(0);
3216
3217         pt = v;
3218         nxt = pt->list.next;
3219         if (pt->type == htons(ETH_P_ALL)) {
3220                 if (nxt != &ptype_all)
3221                         goto found;
3222                 hash = 0;
3223                 nxt = ptype_base[0].next;
3224         } else
3225                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3226
3227         while (nxt == &ptype_base[hash]) {
3228                 if (++hash >= PTYPE_HASH_SIZE)
3229                         return NULL;
3230                 nxt = ptype_base[hash].next;
3231         }
3232 found:
3233         return list_entry(nxt, struct packet_type, list);
3234 }
3235
3236 static void ptype_seq_stop(struct seq_file *seq, void *v)
3237         __releases(RCU)
3238 {
3239         rcu_read_unlock();
3240 }
3241
3242 static int ptype_seq_show(struct seq_file *seq, void *v)
3243 {
3244         struct packet_type *pt = v;
3245
3246         if (v == SEQ_START_TOKEN)
3247                 seq_puts(seq, "Type Device      Function\n");
3248         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3249                 if (pt->type == htons(ETH_P_ALL))
3250                         seq_puts(seq, "ALL ");
3251                 else
3252                         seq_printf(seq, "%04x", ntohs(pt->type));
3253
3254                 seq_printf(seq, " %-8s %pF\n",
3255                            pt->dev ? pt->dev->name : "", pt->func);
3256         }
3257
3258         return 0;
3259 }
3260
3261 static const struct seq_operations ptype_seq_ops = {
3262         .start = ptype_seq_start,
3263         .next  = ptype_seq_next,
3264         .stop  = ptype_seq_stop,
3265         .show  = ptype_seq_show,
3266 };
3267
3268 static int ptype_seq_open(struct inode *inode, struct file *file)
3269 {
3270         return seq_open_net(inode, file, &ptype_seq_ops,
3271                         sizeof(struct seq_net_private));
3272 }
3273
3274 static const struct file_operations ptype_seq_fops = {
3275         .owner   = THIS_MODULE,
3276         .open    = ptype_seq_open,
3277         .read    = seq_read,
3278         .llseek  = seq_lseek,
3279         .release = seq_release_net,
3280 };
3281
3282
3283 static int __net_init dev_proc_net_init(struct net *net)
3284 {
3285         int rc = -ENOMEM;
3286
3287         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3288                 goto out;
3289         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3290                 goto out_dev;
3291         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3292                 goto out_softnet;
3293
3294         if (wext_proc_init(net))
3295                 goto out_ptype;
3296         rc = 0;
3297 out:
3298         return rc;
3299 out_ptype:
3300         proc_net_remove(net, "ptype");
3301 out_softnet:
3302         proc_net_remove(net, "softnet_stat");
3303 out_dev:
3304         proc_net_remove(net, "dev");
3305         goto out;
3306 }
3307
3308 static void __net_exit dev_proc_net_exit(struct net *net)
3309 {
3310         wext_proc_exit(net);
3311
3312         proc_net_remove(net, "ptype");
3313         proc_net_remove(net, "softnet_stat");
3314         proc_net_remove(net, "dev");
3315 }
3316
3317 static struct pernet_operations __net_initdata dev_proc_ops = {
3318         .init = dev_proc_net_init,
3319         .exit = dev_proc_net_exit,
3320 };
3321
3322 static int __init dev_proc_init(void)
3323 {
3324         return register_pernet_subsys(&dev_proc_ops);
3325 }
3326 #else
3327 #define dev_proc_init() 0
3328 #endif  /* CONFIG_PROC_FS */
3329
3330
3331 /**
3332  *      netdev_set_master       -       set up master/slave pair
3333  *      @slave: slave device
3334  *      @master: new master device
3335  *
3336  *      Changes the master device of the slave. Pass %NULL to break the
3337  *      bonding. The caller must hold the RTNL semaphore. On a failure
3338  *      a negative errno code is returned. On success the reference counts
3339  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3340  *      function returns zero.
3341  */
3342 int netdev_set_master(struct net_device *slave, struct net_device *master)
3343 {
3344         struct net_device *old = slave->master;
3345
3346         ASSERT_RTNL();
3347
3348         if (master) {
3349                 if (old)
3350                         return -EBUSY;
3351                 dev_hold(master);
3352         }
3353
3354         slave->master = master;
3355
3356         synchronize_net();
3357
3358         if (old)
3359                 dev_put(old);
3360
3361         if (master)
3362                 slave->flags |= IFF_SLAVE;
3363         else
3364                 slave->flags &= ~IFF_SLAVE;
3365
3366         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3367         return 0;
3368 }
3369 EXPORT_SYMBOL(netdev_set_master);
3370
3371 static void dev_change_rx_flags(struct net_device *dev, int flags)
3372 {
3373         const struct net_device_ops *ops = dev->netdev_ops;
3374
3375         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3376                 ops->ndo_change_rx_flags(dev, flags);
3377 }
3378
3379 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3380 {
3381         unsigned short old_flags = dev->flags;
3382         uid_t uid;
3383         gid_t gid;
3384
3385         ASSERT_RTNL();
3386
3387         dev->flags |= IFF_PROMISC;
3388         dev->promiscuity += inc;
3389         if (dev->promiscuity == 0) {
3390                 /*
3391                  * Avoid overflow.
3392                  * If inc causes overflow, untouch promisc and return error.
3393                  */
3394                 if (inc < 0)
3395                         dev->flags &= ~IFF_PROMISC;
3396                 else {
3397                         dev->promiscuity -= inc;
3398                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3399                                 "set promiscuity failed, promiscuity feature "
3400                                 "of device might be broken.\n", dev->name);
3401                         return -EOVERFLOW;
3402                 }
3403         }
3404         if (dev->flags != old_flags) {
3405                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3406                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3407                                                                "left");
3408                 if (audit_enabled) {
3409                         current_uid_gid(&uid, &gid);
3410                         audit_log(current->audit_context, GFP_ATOMIC,
3411                                 AUDIT_ANOM_PROMISCUOUS,
3412                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3413                                 dev->name, (dev->flags & IFF_PROMISC),
3414                                 (old_flags & IFF_PROMISC),
3415                                 audit_get_loginuid(current),
3416                                 uid, gid,
3417                                 audit_get_sessionid(current));
3418                 }
3419
3420                 dev_change_rx_flags(dev, IFF_PROMISC);
3421         }
3422         return 0;
3423 }
3424
3425 /**
3426  *      dev_set_promiscuity     - update promiscuity count on a device
3427  *      @dev: device
3428  *      @inc: modifier
3429  *
3430  *      Add or remove promiscuity from a device. While the count in the device
3431  *      remains above zero the interface remains promiscuous. Once it hits zero
3432  *      the device reverts back to normal filtering operation. A negative inc
3433  *      value is used to drop promiscuity on the device.
3434  *      Return 0 if successful or a negative errno code on error.
3435  */
3436 int dev_set_promiscuity(struct net_device *dev, int inc)
3437 {
3438         unsigned short old_flags = dev->flags;
3439         int err;
3440
3441         err = __dev_set_promiscuity(dev, inc);
3442         if (err < 0)
3443                 return err;
3444         if (dev->flags != old_flags)
3445                 dev_set_rx_mode(dev);
3446         return err;
3447 }
3448 EXPORT_SYMBOL(dev_set_promiscuity);
3449
3450 /**
3451  *      dev_set_allmulti        - update allmulti count on a device
3452  *      @dev: device
3453  *      @inc: modifier
3454  *
3455  *      Add or remove reception of all multicast frames to a device. While the
3456  *      count in the device remains above zero the interface remains listening
3457  *      to all interfaces. Once it hits zero the device reverts back to normal
3458  *      filtering operation. A negative @inc value is used to drop the counter
3459  *      when releasing a resource needing all multicasts.
3460  *      Return 0 if successful or a negative errno code on error.
3461  */
3462
3463 int dev_set_allmulti(struct net_device *dev, int inc)
3464 {
3465         unsigned short old_flags = dev->flags;
3466
3467         ASSERT_RTNL();
3468
3469         dev->flags |= IFF_ALLMULTI;
3470         dev->allmulti += inc;
3471         if (dev->allmulti == 0) {
3472                 /*
3473                  * Avoid overflow.
3474                  * If inc causes overflow, untouch allmulti and return error.
3475                  */
3476                 if (inc < 0)
3477                         dev->flags &= ~IFF_ALLMULTI;
3478                 else {
3479                         dev->allmulti -= inc;
3480                         printk(KERN_WARNING "%s: allmulti touches roof, "
3481                                 "set allmulti failed, allmulti feature of "
3482                                 "device might be broken.\n", dev->name);
3483                         return -EOVERFLOW;
3484                 }
3485         }
3486         if (dev->flags ^ old_flags) {
3487                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3488                 dev_set_rx_mode(dev);
3489         }
3490         return 0;
3491 }
3492 EXPORT_SYMBOL(dev_set_allmulti);
3493
3494 /*
3495  *      Upload unicast and multicast address lists to device and
3496  *      configure RX filtering. When the device doesn't support unicast
3497  *      filtering it is put in promiscuous mode while unicast addresses
3498  *      are present.
3499  */
3500 void __dev_set_rx_mode(struct net_device *dev)
3501 {
3502         const struct net_device_ops *ops = dev->netdev_ops;
3503
3504         /* dev_open will call this function so the list will stay sane. */
3505         if (!(dev->flags&IFF_UP))
3506                 return;
3507
3508         if (!netif_device_present(dev))
3509                 return;
3510
3511         if (ops->ndo_set_rx_mode)
3512                 ops->ndo_set_rx_mode(dev);
3513         else {
3514                 /* Unicast addresses changes may only happen under the rtnl,
3515                  * therefore calling __dev_set_promiscuity here is safe.
3516                  */
3517                 if (dev->uc.count > 0 && !dev->uc_promisc) {
3518                         __dev_set_promiscuity(dev, 1);
3519                         dev->uc_promisc = 1;
3520                 } else if (dev->uc.count == 0 && dev->uc_promisc) {
3521                         __dev_set_promiscuity(dev, -1);
3522                         dev->uc_promisc = 0;
3523                 }
3524
3525                 if (ops->ndo_set_multicast_list)
3526                         ops->ndo_set_multicast_list(dev);
3527         }
3528 }
3529
/*
 *      Locked wrapper: runs __dev_set_rx_mode() with the device address
 *      lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
3536
3537 /* hw addresses list handling functions */
3538
3539 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3540                          int addr_len, unsigned char addr_type)
3541 {
3542         struct netdev_hw_addr *ha;
3543         int alloc_size;
3544
3545         if (addr_len > MAX_ADDR_LEN)
3546                 return -EINVAL;
3547
3548         list_for_each_entry(ha, &list->list, list) {
3549                 if (!memcmp(ha->addr, addr, addr_len) &&
3550                     ha->type == addr_type) {
3551                         ha->refcount++;
3552                         return 0;
3553                 }
3554         }
3555
3556
3557         alloc_size = sizeof(*ha);
3558         if (alloc_size < L1_CACHE_BYTES)
3559                 alloc_size = L1_CACHE_BYTES;
3560         ha = kmalloc(alloc_size, GFP_ATOMIC);
3561         if (!ha)
3562                 return -ENOMEM;
3563         memcpy(ha->addr, addr, addr_len);
3564         ha->type = addr_type;
3565         ha->refcount = 1;
3566         ha->synced = false;
3567         list_add_tail_rcu(&ha->list, &list->list);
3568         list->count++;
3569         return 0;
3570 }
3571
3572 static void ha_rcu_free(struct rcu_head *head)
3573 {
3574         struct netdev_hw_addr *ha;
3575
3576         ha = container_of(head, struct netdev_hw_addr, rcu_head);
3577         kfree(ha);
3578 }
3579
3580 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3581                          int addr_len, unsigned char addr_type)
3582 {
3583         struct netdev_hw_addr *ha;
3584
3585         list_for_each_entry(ha, &list->list, list) {
3586                 if (!memcmp(ha->addr, addr, addr_len) &&
3587                     (ha->type == addr_type || !addr_type)) {
3588                         if (--ha->refcount)
3589                                 return 0;
3590                         list_del_rcu(&ha->list);
3591                         call_rcu(&ha->rcu_head, ha_rcu_free);
3592                         list->count--;
3593                         return 0;
3594                 }
3595         }
3596         return -ENOENT;
3597 }
3598
3599 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3600                                   struct netdev_hw_addr_list *from_list,
3601                                   int addr_len,
3602                                   unsigned char addr_type)
3603 {
3604         int err;
3605         struct netdev_hw_addr *ha, *ha2;
3606         unsigned char type;
3607
3608         list_for_each_entry(ha, &from_list->list, list) {
3609                 type = addr_type ? addr_type : ha->type;
3610                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3611                 if (err)
3612                         goto unroll;
3613         }
3614         return 0;
3615
3616 unroll:
3617         list_for_each_entry(ha2, &from_list->list, list) {
3618                 if (ha2 == ha)
3619                         break;
3620                 type = addr_type ? addr_type : ha2->type;
3621                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
3622         }
3623         return err;
3624 }
3625
3626 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3627                                    struct netdev_hw_addr_list *from_list,
3628                                    int addr_len,
3629                                    unsigned char addr_type)
3630 {
3631         struct netdev_hw_addr *ha;
3632         unsigned char type;
3633
3634         list_for_each_entry(ha, &from_list->list, list) {
3635                 type = addr_type ? addr_type : ha->type;
3636                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
3637         }
3638 }
3639
3640 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3641                           struct netdev_hw_addr_list *from_list,
3642                           int addr_len)
3643 {
3644         int err = 0;
3645         struct netdev_hw_addr *ha, *tmp;
3646
3647         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3648                 if (!ha->synced) {
3649                         err = __hw_addr_add(to_list, ha->addr,
3650                                             addr_len, ha->type);
3651                         if (err)
3652                                 break;
3653                         ha->synced = true;
3654                         ha->refcount++;
3655                 } else if (ha->refcount == 1) {
3656                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3657                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3658                 }
3659         }
3660         return err;
3661 }
3662
3663 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3664                              struct netdev_hw_addr_list *from_list,
3665                              int addr_len)
3666 {
3667         struct netdev_hw_addr *ha, *tmp;
3668
3669         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3670                 if (ha->synced) {
3671                         __hw_addr_del(to_list, ha->addr,
3672                                       addr_len, ha->type);
3673                         ha->synced = false;
3674                         __hw_addr_del(from_list, ha->addr,
3675                                       addr_len, ha->type);
3676                 }
3677         }
3678 }
3679
3680 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3681 {
3682         struct netdev_hw_addr *ha, *tmp;
3683
3684         list_for_each_entry_safe(ha, tmp, &list->list, list) {
3685                 list_del_rcu(&ha->list);
3686                 call_rcu(&ha->rcu_head, ha_rcu_free);
3687         }
3688         list->count = 0;
3689 }
3690
3691 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3692 {
3693         INIT_LIST_HEAD(&list->list);
3694         list->count = 0;
3695 }
3696
3697 /* Device addresses handling functions */
3698
3699 static void dev_addr_flush(struct net_device *dev)
3700 {
3701         /* rtnl_mutex must be held here */
3702
3703         __hw_addr_flush(&dev->dev_addrs);
3704         dev->dev_addr = NULL;
3705 }
3706
3707 static int dev_addr_init(struct net_device *dev)
3708 {
3709         unsigned char addr[MAX_ADDR_LEN];
3710         struct netdev_hw_addr *ha;
3711         int err;
3712
3713         /* rtnl_mutex must be held here */
3714
3715         __hw_addr_init(&dev->dev_addrs);
3716         memset(addr, 0, sizeof(addr));
3717         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3718                             NETDEV_HW_ADDR_T_LAN);
3719         if (!err) {
3720                 /*
3721                  * Get the first (previously created) address from the list
3722                  * and set dev_addr pointer to this location.
3723                  */
3724                 ha = list_first_entry(&dev->dev_addrs.list,
3725                                       struct netdev_hw_addr, list);
3726                 dev->dev_addr = ha->addr;
3727         }
3728         return err;
3729 }
3730
3731 /**
3732  *      dev_addr_add    - Add a device address
3733  *      @dev: device
3734  *      @addr: address to add
3735  *      @addr_type: address type
3736  *
3737  *      Add a device address to the device or increase the reference count if
3738  *      it already exists.
3739  *
3740  *      The caller must hold the rtnl_mutex.
3741  */
3742 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3743                  unsigned char addr_type)
3744 {
3745         int err;
3746
3747         ASSERT_RTNL();
3748
3749         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3750         if (!err)
3751                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3752         return err;
3753 }
3754 EXPORT_SYMBOL(dev_addr_add);
3755
3756 /**
3757  *      dev_addr_del    - Release a device address.
3758  *      @dev: device
3759  *      @addr: address to delete
3760  *      @addr_type: address type
3761  *
3762  *      Release reference to a device address and remove it from the device
3763  *      if the reference count drops to zero.
3764  *
3765  *      The caller must hold the rtnl_mutex.
3766  */
3767 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3768                  unsigned char addr_type)
3769 {
3770         int err;
3771         struct netdev_hw_addr *ha;
3772
3773         ASSERT_RTNL();
3774
3775         /*
3776          * We can not remove the first address from the list because
3777          * dev->dev_addr points to that.
3778          */
3779         ha = list_first_entry(&dev->dev_addrs.list,
3780                               struct netdev_hw_addr, list);
3781         if (ha->addr == dev->dev_addr && ha->refcount == 1)
3782                 return -ENOENT;
3783
3784         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3785                             addr_type);
3786         if (!err)
3787                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3788         return err;
3789 }
3790 EXPORT_SYMBOL(dev_addr_del);
3791
3792 /**
3793  *      dev_addr_add_multiple   - Add device addresses from another device
3794  *      @to_dev: device to which addresses will be added
3795  *      @from_dev: device from which addresses will be added
3796  *      @addr_type: address type - 0 means type will be used from from_dev
3797  *
3798  *      Add device addresses of the one device to another.
3799  **
3800  *      The caller must hold the rtnl_mutex.
3801  */
3802 int dev_addr_add_multiple(struct net_device *to_dev,
3803                           struct net_device *from_dev,
3804                           unsigned char addr_type)
3805 {
3806         int err;
3807
3808         ASSERT_RTNL();
3809
3810         if (from_dev->addr_len != to_dev->addr_len)
3811                 return -EINVAL;
3812         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3813                                      to_dev->addr_len, addr_type);
3814         if (!err)
3815                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3816         return err;
3817 }
3818 EXPORT_SYMBOL(dev_addr_add_multiple);
3819
3820 /**
3821  *      dev_addr_del_multiple   - Delete device addresses by another device
3822  *      @to_dev: device where the addresses will be deleted
3823  *      @from_dev: device by which addresses the addresses will be deleted
3824  *      @addr_type: address type - 0 means type will used from from_dev
3825  *
3826  *      Deletes addresses in to device by the list of addresses in from device.
3827  *
3828  *      The caller must hold the rtnl_mutex.
3829  */
3830 int dev_addr_del_multiple(struct net_device *to_dev,
3831                           struct net_device *from_dev,
3832                           unsigned char addr_type)
3833 {
3834         ASSERT_RTNL();
3835
3836         if (from_dev->addr_len != to_dev->addr_len)
3837                 return -EINVAL;
3838         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3839                                to_dev->addr_len, addr_type);
3840         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3841         return 0;
3842 }
3843 EXPORT_SYMBOL(dev_addr_del_multiple);
3844
3845 /* multicast addresses handling functions */
3846
3847 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3848                       void *addr, int alen, int glbl)
3849 {
3850         struct dev_addr_list *da;
3851
3852         for (; (da = *list) != NULL; list = &da->next) {
3853                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3854                     alen == da->da_addrlen) {
3855                         if (glbl) {
3856                                 int old_glbl = da->da_gusers;
3857                                 da->da_gusers = 0;
3858                                 if (old_glbl == 0)
3859                                         break;
3860                         }
3861                         if (--da->da_users)
3862                                 return 0;
3863
3864                         *list = da->next;
3865                         kfree(da);
3866                         (*count)--;
3867                         return 0;
3868                 }
3869         }
3870         return -ENOENT;
3871 }
3872
3873 int __dev_addr_add(struct dev_addr_list **list, int *count,
3874                    void *addr, int alen, int glbl)
3875 {
3876         struct dev_addr_list *da;
3877
3878         for (da = *list; da != NULL; da = da->next) {
3879                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3880                     da->da_addrlen == alen) {
3881                         if (glbl) {
3882                                 int old_glbl = da->da_gusers;
3883                                 da->da_gusers = 1;
3884                                 if (old_glbl)
3885                                         return 0;
3886                         }
3887                         da->da_users++;
3888                         return 0;
3889                 }
3890         }
3891
3892         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3893         if (da == NULL)
3894                 return -ENOMEM;
3895         memcpy(da->da_addr, addr, alen);
3896         da->da_addrlen = alen;
3897         da->da_users = 1;
3898         da->da_gusers = glbl ? 1 : 0;
3899         da->next = *list;
3900         *list = da;
3901         (*count)++;
3902         return 0;
3903 }
3904
3905 /**
3906  *      dev_unicast_delete      - Release secondary unicast address.
3907  *      @dev: device
3908  *      @addr: address to delete
3909  *
3910  *      Release reference to a secondary unicast address and remove it
3911  *      from the device if the reference count drops to zero.
3912  *
3913  *      The caller must hold the rtnl_mutex.
3914  */
3915 int dev_unicast_delete(struct net_device *dev, void *addr)
3916 {
3917         int err;
3918
3919         ASSERT_RTNL();
3920
3921         netif_addr_lock_bh(dev);
3922         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
3923                             NETDEV_HW_ADDR_T_UNICAST);
3924         if (!err)
3925                 __dev_set_rx_mode(dev);
3926         netif_addr_unlock_bh(dev);
3927         return err;
3928 }
3929 EXPORT_SYMBOL(dev_unicast_delete);
3930
3931 /**
3932  *      dev_unicast_add         - add a secondary unicast address
3933  *      @dev: device
3934  *      @addr: address to add
3935  *
3936  *      Add a secondary unicast address to the device or increase
3937  *      the reference count if it already exists.
3938  *
3939  *      The caller must hold the rtnl_mutex.
3940  */
3941 int dev_unicast_add(struct net_device *dev, void *addr)
3942 {
3943         int err;
3944
3945         ASSERT_RTNL();
3946
3947         netif_addr_lock_bh(dev);
3948         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
3949                             NETDEV_HW_ADDR_T_UNICAST);
3950         if (!err)
3951                 __dev_set_rx_mode(dev);
3952         netif_addr_unlock_bh(dev);
3953         return err;
3954 }
3955 EXPORT_SYMBOL(dev_unicast_add);
3956
3957 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3958                     struct dev_addr_list **from, int *from_count)
3959 {
3960         struct dev_addr_list *da, *next;
3961         int err = 0;
3962
3963         da = *from;
3964         while (da != NULL) {
3965                 next = da->next;
3966                 if (!da->da_synced) {
3967                         err = __dev_addr_add(to, to_count,
3968                                              da->da_addr, da->da_addrlen, 0);
3969                         if (err < 0)
3970                                 break;
3971                         da->da_synced = 1;
3972                         da->da_users++;
3973                 } else if (da->da_users == 1) {
3974                         __dev_addr_delete(to, to_count,
3975                                           da->da_addr, da->da_addrlen, 0);
3976                         __dev_addr_delete(from, from_count,
3977                                           da->da_addr, da->da_addrlen, 0);
3978                 }
3979                 da = next;
3980         }
3981         return err;
3982 }
3983 EXPORT_SYMBOL_GPL(__dev_addr_sync);
3984
3985 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3986                        struct dev_addr_list **from, int *from_count)
3987 {
3988         struct dev_addr_list *da, *next;
3989
3990         da = *from;
3991         while (da != NULL) {
3992                 next = da->next;
3993                 if (da->da_synced) {
3994                         __dev_addr_delete(to, to_count,
3995                                           da->da_addr, da->da_addrlen, 0);
3996                         da->da_synced = 0;
3997                         __dev_addr_delete(from, from_count,
3998                                           da->da_addr, da->da_addrlen, 0);
3999                 }
4000                 da = next;
4001         }
4002 }
4003 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4004
4005 /**
4006  *      dev_unicast_sync - Synchronize device's unicast list to another device
4007  *      @to: destination device
4008  *      @from: source device
4009  *
4010  *      Add newly added addresses to the destination device and release
4011  *      addresses that have no users left. The source device must be
4012  *      locked by netif_tx_lock_bh.
4013  *
4014  *      This function is intended to be called from the dev->set_rx_mode
4015  *      function of layered software devices.
4016  */
4017 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4018 {
4019         int err = 0;
4020
4021         if (to->addr_len != from->addr_len)
4022                 return -EINVAL;
4023
4024         netif_addr_lock_bh(to);
4025         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4026         if (!err)
4027                 __dev_set_rx_mode(to);
4028         netif_addr_unlock_bh(to);
4029         return err;
4030 }
4031 EXPORT_SYMBOL(dev_unicast_sync);
4032
4033 /**
4034  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4035  *      @to: destination device
4036  *      @from: source device
4037  *
4038  *      Remove all addresses that were added to the destination device by
4039  *      dev_unicast_sync(). This function is intended to be called from the
4040  *      dev->stop function of layered software devices.
4041  */
4042 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4043 {
4044         if (to->addr_len != from->addr_len)
4045                 return;
4046
4047         netif_addr_lock_bh(from);
4048         netif_addr_lock(to);
4049         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4050         __dev_set_rx_mode(to);
4051         netif_addr_unlock(to);
4052         netif_addr_unlock_bh(from);
4053 }
4054 EXPORT_SYMBOL(dev_unicast_unsync);
4055
4056 static void dev_unicast_flush(struct net_device *dev)
4057 {
4058         netif_addr_lock_bh(dev);
4059         __hw_addr_flush(&dev->uc);
4060         netif_addr_unlock_bh(dev);
4061 }
4062
4063 static void dev_unicast_init(struct net_device *dev)
4064 {
4065         __hw_addr_init(&dev->uc);
4066 }
4067
4068
4069 static void __dev_addr_discard(struct dev_addr_list **list)
4070 {
4071         struct dev_addr_list *tmp;
4072
4073         while (*list != NULL) {
4074                 tmp = *list;
4075                 *list = tmp->next;
4076                 if (tmp->da_users > tmp->da_gusers)
4077                         printk("__dev_addr_discard: address leakage! "
4078                                "da_users=%d\n", tmp->da_users);
4079                 kfree(tmp);
4080         }
4081 }
4082
4083 static void dev_addr_discard(struct net_device *dev)
4084 {
4085         netif_addr_lock_bh(dev);
4086
4087         __dev_addr_discard(&dev->mc_list);
4088         dev->mc_count = 0;
4089
4090         netif_addr_unlock_bh(dev);
4091 }
4092
4093 /**
4094  *      dev_get_flags - get flags reported to userspace
4095  *      @dev: device
4096  *
4097  *      Get the combination of flag bits exported through APIs to userspace.
4098  */
4099 unsigned dev_get_flags(const struct net_device *dev)
4100 {
4101         unsigned flags;
4102
4103         flags = (dev->flags & ~(IFF_PROMISC |
4104                                 IFF_ALLMULTI |
4105                                 IFF_RUNNING |
4106                                 IFF_LOWER_UP |
4107                                 IFF_DORMANT)) |
4108                 (dev->gflags & (IFF_PROMISC |
4109                                 IFF_ALLMULTI));
4110
4111         if (netif_running(dev)) {
4112                 if (netif_oper_up(dev))
4113                         flags |= IFF_RUNNING;
4114                 if (netif_carrier_ok(dev))
4115                         flags |= IFF_LOWER_UP;
4116                 if (netif_dormant(dev))
4117                         flags |= IFF_DORMANT;
4118         }
4119
4120         return flags;
4121 }
4122 EXPORT_SYMBOL(dev_get_flags);
4123
4124 /**
4125  *      dev_change_flags - change device settings
4126  *      @dev: device
4127  *      @flags: device state flags
4128  *
4129  *      Change settings on device based state flags. The flags are
4130  *      in the userspace exported format.
4131  */
4132 int dev_change_flags(struct net_device *dev, unsigned flags)
4133 {
4134         int ret, changes;
4135         int old_flags = dev->flags;
4136
4137         ASSERT_RTNL();
4138
4139         /*
4140          *      Set the flags on our device.
4141          */
4142
4143         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4144                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4145                                IFF_AUTOMEDIA)) |
4146                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4147                                     IFF_ALLMULTI));
4148
4149         /*
4150          *      Load in the correct multicast list now the flags have changed.
4151          */
4152
4153         if ((old_flags ^ flags) & IFF_MULTICAST)
4154                 dev_change_rx_flags(dev, IFF_MULTICAST);
4155
4156         dev_set_rx_mode(dev);
4157
4158         /*
4159          *      Have we downed the interface. We handle IFF_UP ourselves
4160          *      according to user attempts to set it, rather than blindly
4161          *      setting it.
4162          */
4163
4164         ret = 0;
4165         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4166                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4167
4168                 if (!ret)
4169                         dev_set_rx_mode(dev);
4170         }
4171
4172         if (dev->flags & IFF_UP &&
4173             ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4174                                           IFF_VOLATILE)))
4175                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4176
4177         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4178                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4179
4180                 dev->gflags ^= IFF_PROMISC;
4181                 dev_set_promiscuity(dev, inc);
4182         }
4183
4184         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4185            is important. Some (broken) drivers set IFF_PROMISC, when
4186            IFF_ALLMULTI is requested not asking us and not reporting.
4187          */
4188         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4189                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4190
4191                 dev->gflags ^= IFF_ALLMULTI;
4192                 dev_set_allmulti(dev, inc);
4193         }
4194
4195         /* Exclude state transition flags, already notified */
4196         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4197         if (changes)
4198                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4199
4200         return ret;
4201 }
4202 EXPORT_SYMBOL(dev_change_flags);
4203
4204 /**
4205  *      dev_set_mtu - Change maximum transfer unit
4206  *      @dev: device
4207  *      @new_mtu: new transfer unit
4208  *
4209  *      Change the maximum transfer size of the network device.
4210  */
4211 int dev_set_mtu(struct net_device *dev, int new_mtu)
4212 {
4213         const struct net_device_ops *ops = dev->netdev_ops;
4214         int err;
4215
4216         if (new_mtu == dev->mtu)
4217                 return 0;
4218
4219         /*      MTU must be positive.    */
4220         if (new_mtu < 0)
4221                 return -EINVAL;
4222
4223         if (!netif_device_present(dev))
4224                 return -ENODEV;
4225
4226         err = 0;
4227         if (ops->ndo_change_mtu)
4228                 err = ops->ndo_change_mtu(dev, new_mtu);
4229         else
4230                 dev->mtu = new_mtu;
4231
4232         if (!err && dev->flags & IFF_UP)
4233                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4234         return err;
4235 }
4236 EXPORT_SYMBOL(dev_set_mtu);
4237
4238 /**
4239  *      dev_set_mac_address - Change Media Access Control Address
4240  *      @dev: device
4241  *      @sa: new address
4242  *
4243  *      Change the hardware (MAC) address of the device
4244  */
4245 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4246 {
4247         const struct net_device_ops *ops = dev->netdev_ops;
4248         int err;
4249
4250         if (!ops->ndo_set_mac_address)
4251                 return -EOPNOTSUPP;
4252         if (sa->sa_family != dev->type)
4253                 return -EINVAL;
4254         if (!netif_device_present(dev))
4255                 return -ENODEV;
4256         err = ops->ndo_set_mac_address(dev, sa);
4257         if (!err)
4258                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4259         return err;
4260 }
4261 EXPORT_SYMBOL(dev_set_mac_address);
4262
4263 /*
4264  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
4265  */
4266 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4267 {
4268         int err;
4269         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4270
4271         if (!dev)
4272                 return -ENODEV;
4273
4274         switch (cmd) {
4275         case SIOCGIFFLAGS:      /* Get interface flags */
4276                 ifr->ifr_flags = (short) dev_get_flags(dev);
4277                 return 0;
4278
4279         case SIOCGIFMETRIC:     /* Get the metric on the interface
4280                                    (currently unused) */
4281                 ifr->ifr_metric = 0;
4282                 return 0;
4283
4284         case SIOCGIFMTU:        /* Get the MTU of a device */
4285                 ifr->ifr_mtu = dev->mtu;
4286                 return 0;
4287
4288         case SIOCGIFHWADDR:
4289                 if (!dev->addr_len)
4290                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4291                 else
4292                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4293                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4294                 ifr->ifr_hwaddr.sa_family = dev->type;
4295                 return 0;
4296
4297         case SIOCGIFSLAVE:
4298                 err = -EINVAL;
4299                 break;
4300
4301         case SIOCGIFMAP:
4302                 ifr->ifr_map.mem_start = dev->mem_start;
4303                 ifr->ifr_map.mem_end   = dev->mem_end;
4304                 ifr->ifr_map.base_addr = dev->base_addr;
4305                 ifr->ifr_map.irq       = dev->irq;
4306                 ifr->ifr_map.dma       = dev->dma;
4307                 ifr->ifr_map.port      = dev->if_port;
4308                 return 0;
4309
4310         case SIOCGIFINDEX:
4311                 ifr->ifr_ifindex = dev->ifindex;
4312                 return 0;
4313
4314         case SIOCGIFTXQLEN:
4315                 ifr->ifr_qlen = dev->tx_queue_len;
4316                 return 0;
4317
4318         default:
4319                 /* dev_ioctl() should ensure this case
4320                  * is never reached
4321                  */
4322                 WARN_ON(1);
4323                 err = -EINVAL;
4324                 break;
4325
4326         }
4327         return err;
4328 }
4329
4330 /*
4331  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4332  */
4333 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4334 {
4335         int err;
4336         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4337         const struct net_device_ops *ops;
4338
4339         if (!dev)
4340                 return -ENODEV;
4341
4342         ops = dev->netdev_ops;
4343
4344         switch (cmd) {
4345         case SIOCSIFFLAGS:      /* Set interface flags */
4346                 return dev_change_flags(dev, ifr->ifr_flags);
4347
4348         case SIOCSIFMETRIC:     /* Set the metric on the interface
4349                                    (currently unused) */
4350                 return -EOPNOTSUPP;
4351
4352         case SIOCSIFMTU:        /* Set the MTU of a device */
4353                 return dev_set_mtu(dev, ifr->ifr_mtu);
4354
4355         case SIOCSIFHWADDR:
4356                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4357
4358         case SIOCSIFHWBROADCAST:
4359                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4360                         return -EINVAL;
4361                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4362                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4363                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4364                 return 0;
4365
4366         case SIOCSIFMAP:
4367                 if (ops->ndo_set_config) {
4368                         if (!netif_device_present(dev))
4369                                 return -ENODEV;
4370                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4371                 }
4372                 return -EOPNOTSUPP;
4373
4374         case SIOCADDMULTI:
4375                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4376                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4377                         return -EINVAL;
4378                 if (!netif_device_present(dev))
4379                         return -ENODEV;
4380                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4381                                   dev->addr_len, 1);
4382
4383         case SIOCDELMULTI:
4384                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4385                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4386                         return -EINVAL;
4387                 if (!netif_device_present(dev))
4388                         return -ENODEV;
4389                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4390                                      dev->addr_len, 1);
4391
4392         case SIOCSIFTXQLEN:
4393                 if (ifr->ifr_qlen < 0)
4394                         return -EINVAL;
4395                 dev->tx_queue_len = ifr->ifr_qlen;
4396                 return 0;
4397
4398         case SIOCSIFNAME:
4399                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4400                 return dev_change_name(dev, ifr->ifr_newname);
4401
4402         /*
4403          *      Unknown or private ioctl
4404          */
4405         default:
4406                 if ((cmd >= SIOCDEVPRIVATE &&
4407                     cmd <= SIOCDEVPRIVATE + 15) ||
4408                     cmd == SIOCBONDENSLAVE ||
4409                     cmd == SIOCBONDRELEASE ||
4410                     cmd == SIOCBONDSETHWADDR ||
4411                     cmd == SIOCBONDSLAVEINFOQUERY ||
4412                     cmd == SIOCBONDINFOQUERY ||
4413                     cmd == SIOCBONDCHANGEACTIVE ||
4414                     cmd == SIOCGMIIPHY ||
4415                     cmd == SIOCGMIIREG ||
4416                     cmd == SIOCSMIIREG ||
4417                     cmd == SIOCBRADDIF ||
4418                     cmd == SIOCBRDELIF ||
4419                     cmd == SIOCSHWTSTAMP ||
4420                     cmd == SIOCWANDEV) {
4421                         err = -EOPNOTSUPP;
4422                         if (ops->ndo_do_ioctl) {
4423                                 if (netif_device_present(dev))
4424                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4425                                 else
4426                                         err = -ENODEV;
4427                         }
4428                 } else
4429                         err = -EINVAL;
4430
4431         }
4432         return err;
4433 }
4434
4435 /*
4436  *      This function handles all "interface"-type I/O control requests. The actual
4437  *      'doing' part of this is dev_ifsioc above.
4438  */
4439
4440 /**
4441  *      dev_ioctl       -       network device ioctl
4442  *      @net: the applicable net namespace
4443  *      @cmd: command to issue
4444  *      @arg: pointer to a struct ifreq in user space
4445  *
4446  *      Issue ioctl functions to devices. This is normally called by the
4447  *      user space syscall interfaces but can sometimes be useful for
4448  *      other purposes. The return value is the return from the syscall if
4449  *      positive or a negative errno code on error.
4450  */
4451
4452 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4453 {
4454         struct ifreq ifr;
4455         int ret;
4456         char *colon;
4457
4458         /* One special case: SIOCGIFCONF takes ifconf argument
4459            and requires shared lock, because it sleeps writing
4460            to user space.
4461          */
4462
4463         if (cmd == SIOCGIFCONF) {
4464                 rtnl_lock();
4465                 ret = dev_ifconf(net, (char __user *) arg);
4466                 rtnl_unlock();
4467                 return ret;
4468         }
4469         if (cmd == SIOCGIFNAME)
4470                 return dev_ifname(net, (struct ifreq __user *)arg);
4471
4472         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4473                 return -EFAULT;
4474
4475         ifr.ifr_name[IFNAMSIZ-1] = 0;
4476
4477         colon = strchr(ifr.ifr_name, ':');
4478         if (colon)
4479                 *colon = 0;
4480
4481         /*
4482          *      See which interface the caller is talking about.
4483          */
4484
4485         switch (cmd) {
4486         /*
4487          *      These ioctl calls:
4488          *      - can be done by all.
4489          *      - atomic and do not require locking.
4490          *      - return a value
4491          */
4492         case SIOCGIFFLAGS:
4493         case SIOCGIFMETRIC:
4494         case SIOCGIFMTU:
4495         case SIOCGIFHWADDR:
4496         case SIOCGIFSLAVE:
4497         case SIOCGIFMAP:
4498         case SIOCGIFINDEX:
4499         case SIOCGIFTXQLEN:
4500                 dev_load(net, ifr.ifr_name);
4501                 read_lock(&dev_base_lock);
4502                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4503                 read_unlock(&dev_base_lock);
4504                 if (!ret) {
4505                         if (colon)
4506                                 *colon = ':';
4507                         if (copy_to_user(arg, &ifr,
4508                                          sizeof(struct ifreq)))
4509                                 ret = -EFAULT;
4510                 }
4511                 return ret;
4512
4513         case SIOCETHTOOL:
4514                 dev_load(net, ifr.ifr_name);
4515                 rtnl_lock();
4516                 ret = dev_ethtool(net, &ifr);
4517                 rtnl_unlock();
4518                 if (!ret) {
4519                         if (colon)
4520                                 *colon = ':';
4521                         if (copy_to_user(arg, &ifr,
4522                                          sizeof(struct ifreq)))
4523                                 ret = -EFAULT;
4524                 }
4525                 return ret;
4526
4527         /*
4528          *      These ioctl calls:
4529          *      - require superuser power.
4530          *      - require strict serialization.
4531          *      - return a value
4532          */
4533         case SIOCGMIIPHY:
4534         case SIOCGMIIREG:
4535         case SIOCSIFNAME:
4536                 if (!capable(CAP_NET_ADMIN))
4537                         return -EPERM;
4538                 dev_load(net, ifr.ifr_name);
4539                 rtnl_lock();
4540                 ret = dev_ifsioc(net, &ifr, cmd);
4541                 rtnl_unlock();
4542                 if (!ret) {
4543                         if (colon)
4544                                 *colon = ':';
4545                         if (copy_to_user(arg, &ifr,
4546                                          sizeof(struct ifreq)))
4547                                 ret = -EFAULT;
4548                 }
4549                 return ret;
4550
4551         /*
4552          *      These ioctl calls:
4553          *      - require superuser power.
4554          *      - require strict serialization.
4555          *      - do not return a value
4556          */
4557         case SIOCSIFFLAGS:
4558         case SIOCSIFMETRIC:
4559         case SIOCSIFMTU:
4560         case SIOCSIFMAP:
4561         case SIOCSIFHWADDR:
4562         case SIOCSIFSLAVE:
4563         case SIOCADDMULTI:
4564         case SIOCDELMULTI:
4565         case SIOCSIFHWBROADCAST:
4566         case SIOCSIFTXQLEN:
4567         case SIOCSMIIREG:
4568         case SIOCBONDENSLAVE:
4569         case SIOCBONDRELEASE:
4570         case SIOCBONDSETHWADDR:
4571         case SIOCBONDCHANGEACTIVE:
4572         case SIOCBRADDIF:
4573         case SIOCBRDELIF:
4574         case SIOCSHWTSTAMP:
4575                 if (!capable(CAP_NET_ADMIN))
4576                         return -EPERM;
4577                 /* fall through */
4578         case SIOCBONDSLAVEINFOQUERY:
4579         case SIOCBONDINFOQUERY:
4580                 dev_load(net, ifr.ifr_name);
4581                 rtnl_lock();
4582                 ret = dev_ifsioc(net, &ifr, cmd);
4583                 rtnl_unlock();
4584                 return ret;
4585
4586         case SIOCGIFMEM:
4587                 /* Get the per device memory space. We can add this but
4588                  * currently do not support it */
4589         case SIOCSIFMEM:
4590                 /* Set the per device memory buffer space.
4591                  * Not applicable in our case */
4592         case SIOCSIFLINK:
4593                 return -EINVAL;
4594
4595         /*
4596          *      Unknown or private ioctl.
4597          */
4598         default:
4599                 if (cmd == SIOCWANDEV ||
4600                     (cmd >= SIOCDEVPRIVATE &&
4601                      cmd <= SIOCDEVPRIVATE + 15)) {
4602                         dev_load(net, ifr.ifr_name);
4603                         rtnl_lock();
4604                         ret = dev_ifsioc(net, &ifr, cmd);
4605                         rtnl_unlock();
4606                         if (!ret && copy_to_user(arg, &ifr,
4607                                                  sizeof(struct ifreq)))
4608                                 ret = -EFAULT;
4609                         return ret;
4610                 }
4611                 /* Take care of Wireless Extensions */
4612                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4613                         return wext_handle_ioctl(net, &ifr, cmd, arg);
4614                 return -EINVAL;
4615         }
4616 }
4617
4618
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns an interface index that no device in @net currently uses.
 *	The caller must hold the rtnl semaphore or the dev_base_lock so
 *	that the value is still unique by the time it is used.
 */
static int dev_new_index(struct net *net)
{
	/* Last index handed out; the search continues from here. */
	static int ifindex;

	do {
		/* Step to the next candidate, restarting at 1 once non-positive. */
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
4637
4638 /* Delayed registration/unregisteration */
4639 static LIST_HEAD(net_todo_list);
4640
4641 static void net_set_todo(struct net_device *dev)
4642 {
4643         list_add_tail(&dev->todo_list, &net_todo_list);
4644 }
4645
4646 static void rollback_registered(struct net_device *dev)
4647 {
4648         BUG_ON(dev_boot_phase);
4649         ASSERT_RTNL();
4650
4651         /* Some devices call without registering for initialization unwind. */
4652         if (dev->reg_state == NETREG_UNINITIALIZED) {
4653                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4654                                   "was registered\n", dev->name, dev);
4655
4656                 WARN_ON(1);
4657                 return;
4658         }
4659
4660         BUG_ON(dev->reg_state != NETREG_REGISTERED);
4661
4662         /* If device is running, close it first. */
4663         dev_close(dev);
4664
4665         /* And unlink it from device chain. */
4666         unlist_netdevice(dev);
4667
4668         dev->reg_state = NETREG_UNREGISTERING;
4669
4670         synchronize_net();
4671
4672         /* Shutdown queueing discipline. */
4673         dev_shutdown(dev);
4674
4675
4676         /* Notify protocols, that we are about to destroy
4677            this device. They should clean all the things.
4678         */
4679         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4680
4681         /*
4682          *      Flush the unicast and multicast chains
4683          */
4684         dev_unicast_flush(dev);
4685         dev_addr_discard(dev);
4686
4687         if (dev->netdev_ops->ndo_uninit)
4688                 dev->netdev_ops->ndo_uninit(dev);
4689
4690         /* Notifier chain MUST detach us from master device. */
4691         WARN_ON(dev->master);
4692
4693         /* Remove entries from kobject tree */
4694         netdev_unregister_kobject(dev);
4695
4696         synchronize_net();
4697
4698         dev_put(dev);
4699 }
4700
4701 static void __netdev_init_queue_locks_one(struct net_device *dev,
4702                                           struct netdev_queue *dev_queue,
4703                                           void *_unused)
4704 {
4705         spin_lock_init(&dev_queue->_xmit_lock);
4706         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4707         dev_queue->xmit_lock_owner = -1;
4708 }
4709
4710 static void netdev_init_queue_locks(struct net_device *dev)
4711 {
4712         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4713         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4714 }
4715
4716 unsigned long netdev_fix_features(unsigned long features, const char *name)
4717 {
4718         /* Fix illegal SG+CSUM combinations. */
4719         if ((features & NETIF_F_SG) &&
4720             !(features & NETIF_F_ALL_CSUM)) {
4721                 if (name)
4722                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4723                                "checksum feature.\n", name);
4724                 features &= ~NETIF_F_SG;
4725         }
4726
4727         /* TSO requires that SG is present as well. */
4728         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4729                 if (name)
4730                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4731                                "SG feature.\n", name);
4732                 features &= ~NETIF_F_TSO;
4733         }
4734
4735         if (features & NETIF_F_UFO) {
4736                 if (!(features & NETIF_F_GEN_CSUM)) {
4737                         if (name)
4738                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4739                                        "since no NETIF_F_HW_CSUM feature.\n",
4740                                        name);
4741                         features &= ~NETIF_F_UFO;
4742                 }
4743
4744                 if (!(features & NETIF_F_SG)) {
4745                         if (name)
4746                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4747                                        "since no NETIF_F_SG feature.\n", name);
4748                         features &= ~NETIF_F_UFO;
4749                 }
4750         }
4751
4752         return features;
4753 }
4754 EXPORT_SYMBOL(netdev_fix_features);
4755
4756 /**
4757  *      register_netdevice      - register a network device
4758  *      @dev: device to register
4759  *
4760  *      Take a completed network device structure and add it to the kernel
4761  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4762  *      chain. 0 is returned on success. A negative errno code is returned
4763  *      on a failure to set up the device, or if the name is a duplicate.
4764  *
4765  *      Callers must hold the rtnl semaphore. You may want
4766  *      register_netdev() instead of this.
4767  *
4768  *      BUGS:
4769  *      The locking appears insufficient to guarantee two parallel registers
4770  *      will not get the same name.
4771  */
4772
4773 int register_netdevice(struct net_device *dev)
4774 {
4775         struct hlist_head *head;
4776         struct hlist_node *p;
4777         int ret;
4778         struct net *net = dev_net(dev);
4779
4780         BUG_ON(dev_boot_phase);
4781         ASSERT_RTNL();
4782
4783         might_sleep();
4784
4785         /* When net_device's are persistent, this will be fatal. */
4786         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4787         BUG_ON(!net);
4788
4789         spin_lock_init(&dev->addr_list_lock);
4790         netdev_set_addr_lockdep_class(dev);
4791         netdev_init_queue_locks(dev);
4792
4793         dev->iflink = -1;
4794
4795         /* Init, if this function is available */
4796         if (dev->netdev_ops->ndo_init) {
4797                 ret = dev->netdev_ops->ndo_init(dev);
4798                 if (ret) {
4799                         if (ret > 0)
4800                                 ret = -EIO;
4801                         goto out;
4802                 }
4803         }
4804
4805         if (!dev_valid_name(dev->name)) {
4806                 ret = -EINVAL;
4807                 goto err_uninit;
4808         }
4809
4810         dev->ifindex = dev_new_index(net);
4811         if (dev->iflink == -1)
4812                 dev->iflink = dev->ifindex;
4813
4814         /* Check for existence of name */
4815         head = dev_name_hash(net, dev->name);
4816         hlist_for_each(p, head) {
4817                 struct net_device *d
4818                         = hlist_entry(p, struct net_device, name_hlist);
4819                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4820                         ret = -EEXIST;
4821                         goto err_uninit;
4822                 }
4823         }
4824
4825         /* Fix illegal checksum combinations */
4826         if ((dev->features & NETIF_F_HW_CSUM) &&
4827             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4828                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4829                        dev->name);
4830                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4831         }
4832
4833         if ((dev->features & NETIF_F_NO_CSUM) &&
4834             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4835                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4836                        dev->name);
4837                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4838         }
4839
4840         dev->features = netdev_fix_features(dev->features, dev->name);
4841
4842         /* Enable software GSO if SG is supported. */
4843         if (dev->features & NETIF_F_SG)
4844                 dev->features |= NETIF_F_GSO;
4845
4846         netdev_initialize_kobject(dev);
4847         ret = netdev_register_kobject(dev);
4848         if (ret)
4849                 goto err_uninit;
4850         dev->reg_state = NETREG_REGISTERED;
4851
4852         /*
4853          *      Default initial state at registry is that the
4854          *      device is present.
4855          */
4856
4857         set_bit(__LINK_STATE_PRESENT, &dev->state);
4858
4859         dev_init_scheduler(dev);
4860         dev_hold(dev);
4861         list_netdevice(dev);
4862
4863         /* Notify protocols, that a new device appeared. */
4864         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4865         ret = notifier_to_errno(ret);
4866         if (ret) {
4867                 rollback_registered(dev);
4868                 dev->reg_state = NETREG_UNREGISTERED;
4869         }
4870         /*
4871          *      Prevent userspace races by waiting until the network
4872          *      device is fully setup before sending notifications.
4873          */
4874         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
4875
4876 out:
4877         return ret;
4878
4879 err_uninit:
4880         if (dev->netdev_ops->ndo_uninit)
4881                 dev->netdev_ops->ndo_uninit(dev);
4882         goto out;
4883 }
4884 EXPORT_SYMBOL(register_netdevice);
4885
4886 /**
4887  *      init_dummy_netdev       - init a dummy network device for NAPI
4888  *      @dev: device to init
4889  *
4890  *      This takes a network device structure and initialize the minimum
4891  *      amount of fields so it can be used to schedule NAPI polls without
4892  *      registering a full blown interface. This is to be used by drivers
4893  *      that need to tie several hardware interfaces to a single NAPI
4894  *      poll scheduler due to HW limitations.
4895  */
4896 int init_dummy_netdev(struct net_device *dev)
4897 {
4898         /* Clear everything. Note we don't initialize spinlocks
4899          * are they aren't supposed to be taken by any of the
4900          * NAPI code and this dummy netdev is supposed to be
4901          * only ever used for NAPI polls
4902          */
4903         memset(dev, 0, sizeof(struct net_device));
4904
4905         /* make sure we BUG if trying to hit standard
4906          * register/unregister code path
4907          */
4908         dev->reg_state = NETREG_DUMMY;
4909
4910         /* initialize the ref count */
4911         atomic_set(&dev->refcnt, 1);
4912
4913         /* NAPI wants this */
4914         INIT_LIST_HEAD(&dev->napi_list);
4915
4916         /* a dummy interface is started by default */
4917         set_bit(__LINK_STATE_PRESENT, &dev->state);
4918         set_bit(__LINK_STATE_START, &dev->state);
4919
4920         return 0;
4921 }
4922 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4923
4924
4925 /**
4926  *      register_netdev - register a network device
4927  *      @dev: device to register
4928  *
4929  *      Take a completed network device structure and add it to the kernel
4930  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4931  *      chain. 0 is returned on success. A negative errno code is returned
4932  *      on a failure to set up the device, or if the name is a duplicate.
4933  *
4934  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4935  *      and expands the device name if you passed a format string to
4936  *      alloc_netdev.
4937  */
4938 int register_netdev(struct net_device *dev)
4939 {
4940         int err;
4941
4942         rtnl_lock();
4943
4944         /*
4945          * If the name is a format string the caller wants us to do a
4946          * name allocation.
4947          */
4948         if (strchr(dev->name, '%')) {
4949                 err = dev_alloc_name(dev, dev->name);
4950                 if (err < 0)
4951                         goto out;
4952         }
4953
4954         err = register_netdevice(dev);
4955 out:
4956         rtnl_unlock();
4957         return err;
4958 }
4959 EXPORT_SYMBOL(register_netdev);
4960
4961 /*
4962  * netdev_wait_allrefs - wait until all references are gone.
4963  *
4964  * This is called when unregistering network devices.
4965  *
4966  * Any protocol or device that holds a reference should register
4967  * for netdevice notification, and cleanup and put back the
4968  * reference if they receive an UNREGISTER event.
4969  * We can get stuck here if buggy protocols don't correctly
4970  * call dev_put.
4971  */
4972 static void netdev_wait_allrefs(struct net_device *dev)
4973 {
4974         unsigned long rebroadcast_time, warning_time;
4975
4976         rebroadcast_time = warning_time = jiffies;
4977         while (atomic_read(&dev->refcnt) != 0) {
4978                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4979                         rtnl_lock();
4980
4981                         /* Rebroadcast unregister notification */
4982                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4983
4984                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4985                                      &dev->state)) {
4986                                 /* We must not have linkwatch events
4987                                  * pending on unregister. If this
4988                                  * happens, we simply run the queue
4989                                  * unscheduled, resulting in a noop
4990                                  * for this device.
4991                                  */
4992                                 linkwatch_run_queue();
4993                         }
4994
4995                         __rtnl_unlock();
4996
4997                         rebroadcast_time = jiffies;
4998                 }
4999
5000                 msleep(250);
5001
5002                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5003                         printk(KERN_EMERG "unregister_netdevice: "
5004                                "waiting for %s to become free. Usage "
5005                                "count = %d\n",
5006                                dev->name, atomic_read(&dev->refcnt));
5007                         warning_time = jiffies;
5008                 }
5009         }
5010 }
5011
5012 /* The sequence is:
5013  *
5014  *      rtnl_lock();
5015  *      ...
5016  *      register_netdevice(x1);
5017  *      register_netdevice(x2);
5018  *      ...
5019  *      unregister_netdevice(y1);
5020  *      unregister_netdevice(y2);
5021  *      ...
5022  *      rtnl_unlock();
5023  *      free_netdev(y1);
5024  *      free_netdev(y2);
5025  *
5026  * We are invoked by rtnl_unlock().
5027  * This allows us to deal with problems:
5028  * 1) We can delete sysfs objects which invoke hotplug
5029  *    without deadlocking with linkwatch via keventd.
5030  * 2) Since we run with the RTNL semaphore not held, we can sleep
5031  *    safely in order to wait for the netdev refcnt to drop to zero.
5032  *
5033  * We must not return until all unregister events added during
5034  * the interval the lock was held have been completed.
5035  */
5036 void netdev_run_todo(void)
5037 {
5038         struct list_head list;
5039
5040         /* Snapshot list, allow later requests */
5041         list_replace_init(&net_todo_list, &list);
5042
5043         __rtnl_unlock();
5044
5045         while (!list_empty(&list)) {
5046                 struct net_device *dev
5047                         = list_entry(list.next, struct net_device, todo_list);
5048                 list_del(&dev->todo_list);
5049
5050                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5051                         printk(KERN_ERR "network todo '%s' but state %d\n",
5052                                dev->name, dev->reg_state);
5053                         dump_stack();
5054                         continue;
5055                 }
5056
5057                 dev->reg_state = NETREG_UNREGISTERED;
5058
5059                 on_each_cpu(flush_backlog, dev, 1);
5060
5061                 netdev_wait_allrefs(dev);
5062
5063                 /* paranoia */
5064                 BUG_ON(atomic_read(&dev->refcnt));
5065                 WARN_ON(dev->ip_ptr);
5066                 WARN_ON(dev->ip6_ptr);
5067                 WARN_ON(dev->dn_ptr);
5068
5069                 if (dev->destructor)
5070                         dev->destructor(dev);
5071
5072                 /* Free network device */
5073                 kobject_put(&dev->dev.kobj);
5074         }
5075 }
5076
5077 /**
5078  *      dev_get_stats   - get network device statistics
5079  *      @dev: device to get statistics from
5080  *
5081  *      Get network statistics from device. The device driver may provide
5082  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5083  *      the internal statistics structure is used.
5084  */
5085 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5086 {
5087         const struct net_device_ops *ops = dev->netdev_ops;
5088
5089         if (ops->ndo_get_stats)
5090                 return ops->ndo_get_stats(dev);
5091         else {
5092                 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5093                 struct net_device_stats *stats = &dev->stats;
5094                 unsigned int i;
5095                 struct netdev_queue *txq;
5096
5097                 for (i = 0; i < dev->num_tx_queues; i++) {
5098                         txq = netdev_get_tx_queue(dev, i);
5099                         tx_bytes   += txq->tx_bytes;
5100                         tx_packets += txq->tx_packets;
5101                         tx_dropped += txq->tx_dropped;
5102                 }
5103                 if (tx_bytes || tx_packets || tx_dropped) {
5104                         stats->tx_bytes   = tx_bytes;
5105                         stats->tx_packets = tx_packets;
5106                         stats->tx_dropped = tx_dropped;
5107                 }
5108                 return stats;
5109         }
5110 }
5111 EXPORT_SYMBOL(dev_get_stats);
5112
5113 static void netdev_init_one_queue(struct net_device *dev,
5114                                   struct netdev_queue *queue,
5115                                   void *_unused)
5116 {
5117         queue->dev = dev;
5118 }
5119
5120 static void netdev_init_queues(struct net_device *dev)
5121 {
5122         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5123         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5124         spin_lock_init(&dev->tx_global_lock);
5125 }
5126
5127 /**
5128  *      alloc_netdev_mq - allocate network device
5129  *      @sizeof_priv:   size of private data to allocate space for
5130  *      @name:          device name format string
5131  *      @setup:         callback to initialize device
5132  *      @queue_count:   the number of subqueues to allocate
5133  *
5134  *      Allocates a struct net_device with private data area for driver use
5135  *      and performs basic initialization.  Also allocates subquue structs
5136  *      for each queue on the device at the end of the netdevice.
5137  */
5138 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5139                 void (*setup)(struct net_device *), unsigned int queue_count)
5140 {
5141         struct netdev_queue *tx;
5142         struct net_device *dev;
5143         size_t alloc_size;
5144         struct net_device *p;
5145
5146         BUG_ON(strlen(name) >= sizeof(dev->name));
5147
5148         alloc_size = sizeof(struct net_device);
5149         if (sizeof_priv) {
5150                 /* ensure 32-byte alignment of private area */
5151                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5152                 alloc_size += sizeof_priv;
5153         }
5154         /* ensure 32-byte alignment of whole construct */
5155         alloc_size += NETDEV_ALIGN - 1;
5156
5157         p = kzalloc(alloc_size, GFP_KERNEL);
5158         if (!p) {
5159                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5160                 return NULL;
5161         }
5162
5163         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5164         if (!tx) {
5165                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5166                        "tx qdiscs.\n");
5167                 goto free_p;
5168         }
5169
5170         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5171         dev->padded = (char *)dev - (char *)p;
5172
5173         if (dev_addr_init(dev))
5174                 goto free_tx;
5175
5176         dev_unicast_init(dev);
5177
5178         dev_net_set(dev, &init_net);
5179
5180         dev->_tx = tx;
5181         dev->num_tx_queues = queue_count;
5182         dev->real_num_tx_queues = queue_count;
5183
5184         dev->gso_max_size = GSO_MAX_SIZE;
5185
5186         netdev_init_queues(dev);
5187
5188         INIT_LIST_HEAD(&dev->napi_list);
5189         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5190         setup(dev);
5191         strcpy(dev->name, name);
5192         return dev;
5193
5194 free_tx:
5195         kfree(tx);
5196
5197 free_p:
5198         kfree(p);
5199         return NULL;
5200 }
5201 EXPORT_SYMBOL(alloc_netdev_mq);
5202
5203 /**
5204  *      free_netdev - free network device
5205  *      @dev: device
5206  *
5207  *      This function does the last stage of destroying an allocated device
5208  *      interface. The reference to the device object is released.
5209  *      If this is the last reference then it will be freed.
5210  */
5211 void free_netdev(struct net_device *dev)
5212 {
5213         struct napi_struct *p, *n;
5214
5215         release_net(dev_net(dev));
5216
5217         kfree(dev->_tx);
5218
5219         /* Flush device addresses */
5220         dev_addr_flush(dev);
5221
5222         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5223                 netif_napi_del(p);
5224
5225         /*  Compatibility with error handling in drivers */
5226         if (dev->reg_state == NETREG_UNINITIALIZED) {
5227                 kfree((char *)dev - dev->padded);
5228                 return;
5229         }
5230
5231         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5232         dev->reg_state = NETREG_RELEASED;
5233
5234         /* will free via device release */
5235         put_device(&dev->dev);
5236 }
5237 EXPORT_SYMBOL(free_netdev);
5238
/**
 *	synchronize_net - wait for in-flight packet receive processing
 *
 *	Blocks until receive processing that was already running when this
 *	function was called has finished.  Packets whose processing starts
 *	afterwards are not waited for.  May sleep.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5251
/**
 *	unregister_netdevice - take a device out of the kernel tables
 *	@dev: device to remove
 *
 *	Shuts the interface down via rollback_registered() and hands it to
 *	net_set_todo() so that the remaining unregister work is completed
 *	after the rtnl semaphore has been released.
 *
 *	The caller must hold the rtnl semaphore.  Most users want
 *	unregister_netdev(), which takes it for them.
 */

void unregister_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	rollback_registered(dev);

	/* The rest of the unregister is processed after unlock. */
	net_set_todo(dev);
}
EXPORT_SYMBOL(unregister_netdevice);
5272
/**
 *	unregister_netdev - take a device out of the kernel tables
 *	@dev: device to remove
 *
 *	Locked wrapper around unregister_netdevice(): acquires the rtnl
 *	semaphore, unregisters @dev and releases the semaphore again.
 *	This is the variant callers should normally use.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5291
5292 /**
5293  *      dev_change_net_namespace - move device to different nethost namespace
5294  *      @dev: device
5295  *      @net: network namespace
5296  *      @pat: If not NULL name pattern to try if the current device name
5297  *            is already taken in the destination network namespace.
5298  *
5299  *      This function shuts down a device interface and moves it
5300  *      to a new network namespace. On success 0 is returned, on
5301  *      a failure a netagive errno code is returned.
5302  *
5303  *      Callers must hold the rtnl semaphore.
5304  */
5305
5306 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5307 {
5308         char buf[IFNAMSIZ];
5309         const char *destname;
5310         int err;
5311
5312         ASSERT_RTNL();
5313
5314         /* Don't allow namespace local devices to be moved. */
5315         err = -EINVAL;
5316         if (dev->features & NETIF_F_NETNS_LOCAL)
5317                 goto out;
5318
5319 #ifdef CONFIG_SYSFS
5320         /* Don't allow real devices to be moved when sysfs
5321          * is enabled.
5322          */
5323         err = -EINVAL;
5324         if (dev->dev.parent)
5325                 goto out;
5326 #endif
5327
5328         /* Ensure the device has been registrered */
5329         err = -EINVAL;
5330         if (dev->reg_state != NETREG_REGISTERED)
5331                 goto out;
5332
5333         /* Get out if there is nothing todo */
5334         err = 0;
5335         if (net_eq(dev_net(dev), net))
5336                 goto out;
5337
5338         /* Pick the destination device name, and ensure
5339          * we can use it in the destination network namespace.
5340          */
5341         err = -EEXIST;
5342         destname = dev->name;
5343         if (__dev_get_by_name(net, destname)) {
5344                 /* We get here if we can't use the current device name */
5345                 if (!pat)
5346                         goto out;
5347                 if (!dev_valid_name(pat))
5348                         goto out;
5349                 if (strchr(pat, '%')) {
5350                         if (__dev_alloc_name(net, pat, buf) < 0)
5351                                 goto out;
5352                         destname = buf;
5353                 } else
5354                         destname = pat;
5355                 if (__dev_get_by_name(net, destname))
5356                         goto out;
5357         }
5358
5359         /*
5360          * And now a mini version of register_netdevice unregister_netdevice.
5361          */
5362
5363         /* If device is running close it first. */
5364         dev_close(dev);
5365
5366         /* And unlink it from device chain */
5367         err = -ENODEV;
5368         unlist_netdevice(dev);
5369
5370         synchronize_net();
5371
5372         /* Shutdown queueing discipline. */
5373         dev_shutdown(dev);
5374
5375         /* Notify protocols, that we are about to destroy
5376            this device. They should clean all the things.
5377         */
5378         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5379
5380         /*
5381          *      Flush the unicast and multicast chains
5382          */
5383         dev_unicast_flush(dev);
5384         dev_addr_discard(dev);
5385
5386         netdev_unregister_kobject(dev);
5387
5388         /* Actually switch the network namespace */
5389         dev_net_set(dev, net);
5390
5391         /* Assign the new device name */
5392         if (destname != dev->name)
5393                 strcpy(dev->name, destname);
5394
5395         /* If there is an ifindex conflict assign a new one */
5396         if (__dev_get_by_index(net, dev->ifindex)) {
5397                 int iflink = (dev->iflink == dev->ifindex);
5398                 dev->ifindex = dev_new_index(net);
5399                 if (iflink)
5400                         dev->iflink = dev->ifindex;
5401         }
5402
5403         /* Fixup kobjects */
5404         err = netdev_register_kobject(dev);
5405         WARN_ON(err);
5406
5407         /* Add the device back in the hashes */
5408         list_netdevice(dev);
5409
5410         /* Notify protocols, that a new device appeared. */
5411         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5412
5413         /*
5414          *      Prevent userspace races by waiting until the network
5415          *      device is fully setup before sending notifications.
5416          */
5417         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5418
5419         synchronize_net();
5420         err = 0;
5421 out:
5422         return err;
5423 }
5424 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5425
5426 static int dev_cpu_callback(struct notifier_block *nfb,
5427                             unsigned long action,
5428                             void *ocpu)
5429 {
5430         struct sk_buff **list_skb;
5431         struct Qdisc **list_net;
5432         struct sk_buff *skb;
5433         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5434         struct softnet_data *sd, *oldsd;
5435
5436         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5437                 return NOTIFY_OK;
5438
5439         local_irq_disable();
5440         cpu = smp_processor_id();
5441         sd = &per_cpu(softnet_data, cpu);
5442         oldsd = &per_cpu(softnet_data, oldcpu);
5443
5444         /* Find end of our completion_queue. */
5445         list_skb = &sd->completion_queue;
5446         while (*list_skb)
5447                 list_skb = &(*list_skb)->next;
5448         /* Append completion queue from offline CPU. */
5449         *list_skb = oldsd->completion_queue;
5450         oldsd->completion_queue = NULL;
5451
5452         /* Find end of our output_queue. */
5453         list_net = &sd->output_queue;
5454         while (*list_net)
5455                 list_net = &(*list_net)->next_sched;
5456         /* Append output queue from offline CPU. */
5457         *list_net = oldsd->output_queue;
5458         oldsd->output_queue = NULL;
5459
5460         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5461         local_irq_enable();
5462
5463         /* Process offline CPU's input_pkt_queue */
5464         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5465                 netif_rx(skb);
5466
5467         return NOTIFY_OK;
5468 }
5469
5470
5471 /**
5472  *      netdev_increment_features - increment feature set by one
5473  *      @all: current feature set
5474  *      @one: new feature set
5475  *      @mask: mask feature set
5476  *
5477  *      Computes a new feature set after adding a device with feature set
5478  *      @one to the master device with current feature set @all.  Will not
5479  *      enable anything that is off in @mask. Returns the new feature set.
5480  */
5481 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5482                                         unsigned long mask)
5483 {
5484         /* If device needs checksumming, downgrade to it. */
5485         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5486                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5487         else if (mask & NETIF_F_ALL_CSUM) {
5488                 /* If one device supports v4/v6 checksumming, set for all. */
5489                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5490                     !(all & NETIF_F_GEN_CSUM)) {
5491                         all &= ~NETIF_F_ALL_CSUM;
5492                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5493                 }
5494
5495                 /* If one device supports hw checksumming, set for all. */
5496                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5497                         all &= ~NETIF_F_ALL_CSUM;
5498                         all |= NETIF_F_HW_CSUM;
5499                 }
5500         }
5501
5502         one |= NETIF_F_ALL_CSUM;
5503
5504         one |= all & NETIF_F_ONE_FOR_ALL;
5505         all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5506         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5507
5508         return all;
5509 }
5510 EXPORT_SYMBOL(netdev_increment_features);
5511
5512 static struct hlist_head *netdev_create_hash(void)
5513 {
5514         int i;
5515         struct hlist_head *hash;
5516
5517         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5518         if (hash != NULL)
5519                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5520                         INIT_HLIST_HEAD(&hash[i]);
5521
5522         return hash;
5523 }
5524
5525 /* Initialize per network namespace state */
5526 static int __net_init netdev_init(struct net *net)
5527 {
5528         INIT_LIST_HEAD(&net->dev_base_head);
5529
5530         net->dev_name_head = netdev_create_hash();
5531         if (net->dev_name_head == NULL)
5532                 goto err_name;
5533
5534         net->dev_index_head = netdev_create_hash();
5535         if (net->dev_index_head == NULL)
5536                 goto err_idx;
5537
5538         return 0;
5539
5540 err_idx:
5541         kfree(net->dev_name_head);
5542 err_name:
5543         return -ENOMEM;
5544 }
5545
5546 /**
5547  *      netdev_drivername - network driver for the device
5548  *      @dev: network device
5549  *      @buffer: buffer for resulting name
5550  *      @len: size of buffer
5551  *
5552  *      Determine network driver for device.
5553  */
5554 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5555 {
5556         const struct device_driver *driver;
5557         const struct device *parent;
5558
5559         if (len <= 0 || !buffer)
5560                 return buffer;
5561         buffer[0] = 0;
5562
5563         parent = dev->dev.parent;
5564
5565         if (!parent)
5566                 return buffer;
5567
5568         driver = parent->driver;
5569         if (driver && driver->name)
5570                 strlcpy(buffer, driver->name, len);
5571         return buffer;
5572 }
5573
5574 static void __net_exit netdev_exit(struct net *net)
5575 {
5576         kfree(net->dev_name_head);
5577         kfree(net->dev_index_head);
5578 }
5579
5580 static struct pernet_operations __net_initdata netdev_net_ops = {
5581         .init = netdev_init,
5582         .exit = netdev_exit,
5583 };
5584
5585 static void __net_exit default_device_exit(struct net *net)
5586 {
5587         struct net_device *dev;
5588         /*
5589          * Push all migratable of the network devices back to the
5590          * initial network namespace
5591          */
5592         rtnl_lock();
5593 restart:
5594         for_each_netdev(net, dev) {
5595                 int err;
5596                 char fb_name[IFNAMSIZ];
5597
5598                 /* Ignore unmoveable devices (i.e. loopback) */
5599                 if (dev->features & NETIF_F_NETNS_LOCAL)
5600                         continue;
5601
5602                 /* Delete virtual devices */
5603                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5604                         dev->rtnl_link_ops->dellink(dev);
5605                         goto restart;
5606                 }
5607
5608                 /* Push remaing network devices to init_net */
5609                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5610                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5611                 if (err) {
5612                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5613                                 __func__, dev->name, err);
5614                         BUG();
5615                 }
5616                 goto restart;
5617         }
5618         rtnl_unlock();
5619 }
5620
5621 static struct pernet_operations __net_initdata default_device_ops = {
5622         .exit = default_device_exit,
5623 };
5624
5625 /*
5626  *      Initialize the DEV module. At boot time this walks the device list and
5627  *      unhooks any devices that fail to initialise (normally hardware not
5628  *      present) and leaves us with a valid list of present and active devices.
5629  *
5630  */
5631
5632 /*
5633  *       This is called single threaded during boot, so no need
5634  *       to take the rtnl semaphore.
5635  */
5636 static int __init net_dev_init(void)
5637 {
5638         int i, rc = -ENOMEM;
5639
5640         BUG_ON(!dev_boot_phase);
5641
5642         if (dev_proc_init())
5643                 goto out;
5644
5645         if (netdev_kobject_init())
5646                 goto out;
5647
5648         INIT_LIST_HEAD(&ptype_all);
5649         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5650                 INIT_LIST_HEAD(&ptype_base[i]);
5651
5652         if (register_pernet_subsys(&netdev_net_ops))
5653                 goto out;
5654
5655         /*
5656          *      Initialise the packet receive queues.
5657          */
5658
5659         for_each_possible_cpu(i) {
5660                 struct softnet_data *queue;
5661
5662                 queue = &per_cpu(softnet_data, i);
5663                 skb_queue_head_init(&queue->input_pkt_queue);
5664                 queue->completion_queue = NULL;
5665                 INIT_LIST_HEAD(&queue->poll_list);
5666
5667                 queue->backlog.poll = process_backlog;
5668                 queue->backlog.weight = weight_p;
5669                 queue->backlog.gro_list = NULL;
5670                 queue->backlog.gro_count = 0;
5671         }
5672
5673         dev_boot_phase = 0;
5674
5675         /* The loopback device is special if any other network devices
5676          * is present in a network namespace the loopback device must
5677          * be present. Since we now dynamically allocate and free the
5678          * loopback device ensure this invariant is maintained by
5679          * keeping the loopback device as the first device on the
5680          * list of network devices.  Ensuring the loopback devices
5681          * is the first device that appears and the last network device
5682          * that disappears.
5683          */
5684         if (register_pernet_device(&loopback_net_ops))
5685                 goto out;
5686
5687         if (register_pernet_device(&default_device_ops))
5688                 goto out;
5689
5690         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5691         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5692
5693         hotcpu_notifier(dev_cpu_callback, 0);
5694         dst_init();
5695         dev_mcast_init();
5696         rc = 0;
5697 out:
5698         return rc;
5699 }
5700
5701 subsys_initcall(net_dev_init);
5702
5703 static int __init initialize_hashrnd(void)
5704 {
5705         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5706         return 0;
5707 }
5708
5709 late_initcall_sync(initialize_hashrnd);
5710