mm/slqb.c
1 /*
2 * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
3  * with order-0 allocations. Fastpath emphasis is placed on local allocation
4 * and freeing, but with a secondary goal of good remote freeing (freeing on
5 * another CPU from that which allocated).
7 * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
8 */
10 #include <linux/mm.h>
11 #include <linux/swap.h> /* struct reclaim_state */
12 #include <linux/module.h>
13 #include <linux/interrupt.h>
14 #include <linux/slab.h>
15 #include <linux/seq_file.h>
16 #include <linux/cpu.h>
17 #include <linux/cpuset.h>
18 #include <linux/mempolicy.h>
19 #include <linux/ctype.h>
20 #include <linux/kallsyms.h>
21 #include <linux/memory.h>
22 #include <linux/fault-inject.h>
25 * TODO
26 * - fix up releasing of offlined data structures. Not a big deal because
27 * they don't get cumulatively leaked with successive online/offline cycles
28 * - allow OOM conditions to flush back per-CPU pages to common lists to be
29 * reused by other CPUs.
30  * - investigate performance with memoryless nodes. Perhaps CPUs can be given
31  *   a default closest home node via which they can use fastpath functions.
32 * Perhaps it is not a big problem.
36  * slqb_page overloads struct page, and is used to manage some slab allocation
37 * aspects, however to avoid the horrible mess in include/linux/mm_types.h,
38 * we'll just define our own struct slqb_page type variant here.
40 struct slqb_page {
41 union {
42 struct {
43 unsigned long flags; /* mandatory */
44 atomic_t _count; /* mandatory */
45 unsigned int inuse; /* Nr of objects */
46 struct kmem_cache_list *list; /* Pointer to list */
47 void **freelist; /* LIFO freelist */
48 union {
49 struct list_head lru; /* misc. list */
50 struct rcu_head rcu_head; /* for rcu freeing */
53 struct page page;
56 static inline void struct_slqb_page_wrong_size(void)
57 { BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
59 #define PG_SLQB_BIT (1 << PG_slab)
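/*
 * SLQB marks its pages by setting the PG_slab bit in page->flags; the free
 * path and kmem_ptr_validate() below rely on this bit to recognise slab pages.
 */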
62 * slqb_min_order: minimum allocation order for slabs
64 static int slqb_min_order;
67 * slqb_min_objects: minimum number of objects per slab. Increasing this
68 * will increase the allocation order for slabs with larger objects
70 static int slqb_min_objects = 1;
72 #ifdef CONFIG_NUMA
73 static inline int slab_numa(struct kmem_cache *s)
75 return s->flags & SLAB_NUMA;
77 #else
78 static inline int slab_numa(struct kmem_cache *s)
80 return 0;
82 #endif
84 static inline int slab_hiwater(struct kmem_cache *s)
86 return s->hiwater;
89 static inline int slab_freebatch(struct kmem_cache *s)
91 return s->freebatch;
95 * Lock order:
96 * kmem_cache_node->list_lock
97 * kmem_cache_remote_free->lock
99 * Data structures:
100 * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
102 * - A LIFO list of node-local objects. Allocation and freeing of node local
103 * objects goes first to this list.
105 * - 2 Lists of slab pages, free and partial pages. If an allocation misses
106 * the object list, it tries from the partial list, then the free list.
107 * After freeing an object to the object list, if it is over a watermark,
108 * some objects are freed back to pages. If an allocation misses these lists,
109 * a new slab page is allocated from the page allocator. If the free list
110 * reaches a watermark, some of its pages are returned to the page allocator.
112 * - A remote free queue, where objects freed that did not come from the local
113 * node are queued to. When this reaches a watermark, the objects are
114 * flushed.
116 * - A remotely freed queue, where objects allocated from this CPU are flushed
117 * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
118 * used to protect access to this queue.
120 * When the remotely freed queue reaches a watermark, a flag is set to tell
121 * the owner CPU to check it. The owner CPU will then check the queue on the
122 * next allocation that misses the object list. It will move all objects from
123 * this list onto the object list and then allocate one.
125 * This system of remote queueing is intended to reduce lock and remote
126 * cacheline acquisitions, and give a cooling off period for remotely freed
127 * objects before they are re-allocated.
129  * Node-specific allocations from somewhere other than the local node are
130 * handled by a per-node list which is the same as the above per-CPU data
131 * structures except for the following differences:
133 * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
134 * allocate from a given node.
136 * - There is no remote free queue. Nodes don't free objects, CPUs do.
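/*
 * A rough sketch of the object flow described above (a summary of the design
 * comment, not literal code; the real structure definitions live in the SLQB
 * header rather than this file):
 *
 *   allocation miss:  CPU LIFO freelist -> partial list -> free list
 *                     -> page allocator (new slab page)
 *   local free:       object -> CPU LIFO freelist
 *                     (flushed back to its pages past the hiwater mark)
 *   remote free:      object -> this CPU's remote free queue
 *                     -> owner list's remotely-freed queue (under its lock)
 *                     -> claimed onto the owner's freelist on its next miss
 */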
139 static inline void slqb_stat_inc(struct kmem_cache_list *list,
140 enum stat_item si)
142 #ifdef CONFIG_SLQB_STATS
143 list->stats[si]++;
144 #endif
147 static inline void slqb_stat_add(struct kmem_cache_list *list,
148 enum stat_item si, unsigned long nr)
150 #ifdef CONFIG_SLQB_STATS
151 list->stats[si] += nr;
152 #endif
155 static inline int slqb_page_to_nid(struct slqb_page *page)
157 return page_to_nid(&page->page);
160 static inline void *slqb_page_address(struct slqb_page *page)
162 return page_address(&page->page);
165 static inline struct zone *slqb_page_zone(struct slqb_page *page)
167 return page_zone(&page->page);
170 static inline int virt_to_nid(const void *addr)
172 return page_to_nid(virt_to_page(addr));
175 static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
177 struct page *p;
179 p = virt_to_head_page(addr);
180 return (struct slqb_page *)p;
183 static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
184 int pages)
186 struct page *p = &page->page;
188 reset_page_mapcount(p);
189 p->mapping = NULL;
190 VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
191 p->flags &= ~PG_SLQB_BIT;
193 if (current->reclaim_state)
194 current->reclaim_state->reclaimed_slab += pages;
195 __free_pages(p, order);
198 #ifdef CONFIG_SLQB_DEBUG
199 static inline int slab_debug(struct kmem_cache *s)
201 return s->flags &
202 (SLAB_DEBUG_FREE |
203 SLAB_RED_ZONE |
204 SLAB_POISON |
205 SLAB_STORE_USER |
206 SLAB_TRACE);
208 static inline int slab_poison(struct kmem_cache *s)
210 return s->flags & SLAB_POISON;
212 #else
213 static inline int slab_debug(struct kmem_cache *s)
215 return 0;
217 static inline int slab_poison(struct kmem_cache *s)
219 return 0;
221 #endif
223 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
224 SLAB_POISON | SLAB_STORE_USER)
226 /* Internal SLQB flags */
227 #define __OBJECT_POISON 0x80000000 /* Poison object */
229 /* Not all arches define cache_line_size */
230 #ifndef cache_line_size
231 #define cache_line_size() L1_CACHE_BYTES
232 #endif
234 #ifdef CONFIG_SMP
235 static struct notifier_block slab_notifier;
236 #endif
239  * slqb_lock protects the slab_caches list and serialises hotplug operations.
240  * Hotplug operations take the lock for write; other operations can hold off
241 * hotplug by taking it for read (or write).
243 static DECLARE_RWSEM(slqb_lock);
246 * A list of all slab caches on the system
248 static LIST_HEAD(slab_caches);
251 * Tracking user of a slab.
253 struct track {
254 unsigned long addr; /* Called from address */
255 int cpu; /* Was running on cpu */
256 int pid; /* Pid context */
257 unsigned long when; /* When did the operation occur */
260 enum track_item { TRACK_ALLOC, TRACK_FREE };
262 static struct kmem_cache kmem_cache_cache;
264 #ifdef CONFIG_SLQB_SYSFS
265 static int sysfs_slab_add(struct kmem_cache *s);
266 static void sysfs_slab_remove(struct kmem_cache *s);
267 #else
268 static inline int sysfs_slab_add(struct kmem_cache *s)
270 return 0;
272 static inline void sysfs_slab_remove(struct kmem_cache *s)
274 kmem_cache_free(&kmem_cache_cache, s);
276 #endif
278 /********************************************************************
279 * Core slab cache functions
280 *******************************************************************/
282 static int __slab_is_available __read_mostly;
283 int slab_is_available(void)
285 return __slab_is_available;
288 static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
290 #ifdef CONFIG_SMP
291 VM_BUG_ON(!s->cpu_slab[cpu]);
292 return s->cpu_slab[cpu];
293 #else
294 return &s->cpu_slab;
295 #endif
298 static inline int check_valid_pointer(struct kmem_cache *s,
299 struct slqb_page *page, const void *object)
301 void *base;
303 base = slqb_page_address(page);
304 if (object < base || object >= base + s->objects * s->size ||
305 (object - base) % s->size) {
306 return 0;
309 return 1;
312 static inline void *get_freepointer(struct kmem_cache *s, void *object)
314 return *(void **)(object + s->offset);
317 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
319 *(void **)(object + s->offset) = fp;
322 /* Loop over all objects in a slab */
323 #define for_each_object(__p, __s, __addr) \
324 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
325 __p += (__s)->size)
327 /* Scan freelist */
328 #define for_each_free_object(__p, __s, __free) \
329 for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
330 __p))
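/*
 * Note: objects within a slab page are laid out back to back at s->size byte
 * strides from the (possibly colour-offset) start of the page, and each
 * freelist is a singly linked list threaded through the free objects
 * themselves via the word at s->offset, as get_freepointer() and
 * set_freepointer() above show.
 */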
332 #ifdef CONFIG_SLQB_DEBUG
334 * Debug settings:
336 #ifdef CONFIG_SLQB_DEBUG_ON
337 static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
338 #else
339 static int slqb_debug __read_mostly;
340 #endif
342 static char *slqb_debug_slabs;
345 * Object debugging
347 static void print_section(char *text, u8 *addr, unsigned int length)
349 int i, offset;
350 int newline = 1;
351 char ascii[17];
353 ascii[16] = 0;
355 for (i = 0; i < length; i++) {
356 if (newline) {
357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
358 newline = 0;
360 printk(KERN_CONT " %02x", addr[i]);
361 offset = i % 16;
362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
363 if (offset == 15) {
364 printk(KERN_CONT " %s\n", ascii);
365 newline = 1;
368 if (!newline) {
369 i %= 16;
370 while (i < 16) {
371 printk(KERN_CONT " ");
372 ascii[i] = ' ';
373 i++;
375 printk(KERN_CONT " %s\n", ascii);
379 static struct track *get_track(struct kmem_cache *s, void *object,
380 enum track_item alloc)
382 struct track *p;
384 if (s->offset)
385 p = object + s->offset + sizeof(void *);
386 else
387 p = object + s->inuse;
389 return p + alloc;
392 static void set_track(struct kmem_cache *s, void *object,
393 enum track_item alloc, unsigned long addr)
395 struct track *p;
397 if (s->offset)
398 p = object + s->offset + sizeof(void *);
399 else
400 p = object + s->inuse;
402 p += alloc;
403 if (addr) {
404 p->addr = addr;
405 p->cpu = raw_smp_processor_id();
406 p->pid = current ? current->pid : -1;
407 p->when = jiffies;
408 } else
409 memset(p, 0, sizeof(struct track));
412 static void init_tracking(struct kmem_cache *s, void *object)
414 if (!(s->flags & SLAB_STORE_USER))
415 return;
417 set_track(s, object, TRACK_FREE, 0UL);
418 set_track(s, object, TRACK_ALLOC, 0UL);
421 static void print_track(const char *s, struct track *t)
423 if (!t->addr)
424 return;
426 printk(KERN_ERR "INFO: %s in ", s);
427 __print_symbol("%s", (unsigned long)t->addr);
428 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
431 static void print_tracking(struct kmem_cache *s, void *object)
433 if (!(s->flags & SLAB_STORE_USER))
434 return;
436 print_track("Allocated", get_track(s, object, TRACK_ALLOC));
437 print_track("Freed", get_track(s, object, TRACK_FREE));
440 static void print_page_info(struct slqb_page *page)
442 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
443 page, page->inuse, page->freelist, page->flags);
447 #define MAX_ERR_STR 100
448 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
450 va_list args;
451 char buf[MAX_ERR_STR];
453 va_start(args, fmt);
454 vsnprintf(buf, sizeof(buf), fmt, args);
455 va_end(args);
456 printk(KERN_ERR "========================================"
457 "=====================================\n");
458 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
459 printk(KERN_ERR "----------------------------------------"
460 "-------------------------------------\n\n");
463 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
465 va_list args;
466 char buf[100];
468 va_start(args, fmt);
469 vsnprintf(buf, sizeof(buf), fmt, args);
470 va_end(args);
471 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
474 static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
476 unsigned int off; /* Offset of last byte */
477 u8 *addr = slqb_page_address(page);
479 print_tracking(s, p);
481 print_page_info(page);
483 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
484 p, p - addr, get_freepointer(s, p));
486 if (p > addr + 16)
487 print_section("Bytes b4", p - 16, 16);
489 print_section("Object", p, min(s->objsize, 128));
491 if (s->flags & SLAB_RED_ZONE)
492 print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
494 if (s->offset)
495 off = s->offset + sizeof(void *);
496 else
497 off = s->inuse;
499 if (s->flags & SLAB_STORE_USER)
500 off += 2 * sizeof(struct track);
502 if (off != s->size) {
503 /* Beginning of the filler is the free pointer */
504 print_section("Padding", p + off, s->size - off);
507 dump_stack();
510 static void object_err(struct kmem_cache *s, struct slqb_page *page,
511 u8 *object, char *reason)
513 slab_bug(s, reason);
514 print_trailer(s, page, object);
517 static void slab_err(struct kmem_cache *s, struct slqb_page *page,
518 char *fmt, ...)
520 slab_bug(s, fmt);
521 print_page_info(page);
522 dump_stack();
525 static void init_object(struct kmem_cache *s, void *object, int active)
527 u8 *p = object;
529 if (s->flags & __OBJECT_POISON) {
530 memset(p, POISON_FREE, s->objsize - 1);
531 p[s->objsize - 1] = POISON_END;
534 if (s->flags & SLAB_RED_ZONE) {
535 memset(p + s->objsize,
536 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
537 s->inuse - s->objsize);
541 static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
543 while (bytes) {
544 if (*start != (u8)value)
545 return start;
546 start++;
547 bytes--;
549 return NULL;
552 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
553 void *from, void *to)
555 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
556 memset(from, data, to - from);
559 static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
560 u8 *object, char *what,
561 u8 *start, unsigned int value, unsigned int bytes)
563 u8 *fault;
564 u8 *end;
566 fault = check_bytes(start, value, bytes);
567 if (!fault)
568 return 1;
570 end = start + bytes;
571 while (end > fault && end[-1] == value)
572 end--;
574 slab_bug(s, "%s overwritten", what);
575 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
576 fault, end - 1, fault[0], value);
577 print_trailer(s, page, object);
579 restore_bytes(s, what, value, fault, end);
580 return 0;
584 * Object layout:
586 * object address
587 * Bytes of the object to be managed.
588 * If the freepointer may overlay the object then the free
589 * pointer is the first word of the object.
591 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
592 * 0xa5 (POISON_END)
594 * object + s->objsize
595 * Padding to reach word boundary. This is also used for Redzoning.
596 * Padding is extended by another word if Redzoning is enabled and
597 * objsize == inuse.
599 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
600 * 0xcc (RED_ACTIVE) for objects in use.
602 * object + s->inuse
603 * Meta data starts here.
605 * A. Free pointer (if we cannot overwrite object on free)
606 * B. Tracking data for SLAB_STORE_USER
607  * 	C. Padding to reach required alignment boundary or at minimum
608  * 		one word if debugging is on to be able to detect writes
609 * before the word boundary.
611 * Padding is done using 0x5a (POISON_INUSE)
613 * object + s->size
614 * Nothing is used beyond s->size.
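/*
 * The same layout in table form (each region is present only when the
 * corresponding flag or condition described above applies):
 *
 *	offset			contents
 *	0 .. objsize		object payload (POISON_FREE/POISON_END if poisoned)
 *	objsize .. inuse	red zone (RED_ACTIVE / RED_INACTIVE)
 *	inuse ..		free pointer (when it may not overlay the object),
 *				then 2 x struct track (SLAB_STORE_USER)
 *	.. size			POISON_INUSE padding up to the aligned object size
 */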
617 static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
619 unsigned long off = s->inuse; /* The end of info */
621 if (s->offset) {
622 /* Freepointer is placed after the object. */
623 off += sizeof(void *);
626 if (s->flags & SLAB_STORE_USER) {
627 /* We also have user information there */
628 off += 2 * sizeof(struct track);
631 if (s->size == off)
632 return 1;
634 return check_bytes_and_report(s, page, p, "Object padding",
635 p + off, POISON_INUSE, s->size - off);
638 static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
640 u8 *start;
641 u8 *fault;
642 u8 *end;
643 int length;
644 int remainder;
646 if (!(s->flags & SLAB_POISON))
647 return 1;
649 start = slqb_page_address(page);
650 end = start + (PAGE_SIZE << s->order);
651 length = s->objects * s->size;
652 remainder = end - (start + length);
653 if (!remainder)
654 return 1;
656 fault = check_bytes(start + length, POISON_INUSE, remainder);
657 if (!fault)
658 return 1;
660 while (end > fault && end[-1] == POISON_INUSE)
661 end--;
663 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
664 print_section("Padding", start, length);
666 restore_bytes(s, "slab padding", POISON_INUSE, start, end);
667 return 0;
670 static int check_object(struct kmem_cache *s, struct slqb_page *page,
671 void *object, int active)
673 u8 *p = object;
674 u8 *endobject = object + s->objsize;
676 if (s->flags & SLAB_RED_ZONE) {
677 unsigned int red =
678 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
680 if (!check_bytes_and_report(s, page, object, "Redzone",
681 endobject, red, s->inuse - s->objsize))
682 return 0;
683 } else {
684 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
685 check_bytes_and_report(s, page, p, "Alignment padding",
686 endobject, POISON_INUSE, s->inuse - s->objsize);
690 if (s->flags & SLAB_POISON) {
691 if (!active && (s->flags & __OBJECT_POISON)) {
692 if (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1))
694 return 0;
696 if (!check_bytes_and_report(s, page, p, "Poison",
697 p + s->objsize - 1, POISON_END, 1))
698 return 0;
702 * check_pad_bytes cleans up on its own.
704 check_pad_bytes(s, page, p);
707 return 1;
710 static int check_slab(struct kmem_cache *s, struct slqb_page *page)
712 if (!(page->flags & PG_SLQB_BIT)) {
713 slab_err(s, page, "Not a valid slab page");
714 return 0;
716 if (page->inuse == 0) {
717 		slab_err(s, page, "inuse before free / after alloc");
718 return 0;
720 if (page->inuse > s->objects) {
721 		slab_err(s, page, "inuse %u > max %u",
722 			page->inuse, s->objects);
723 return 0;
725 /* Slab_pad_check fixes things up after itself */
726 slab_pad_check(s, page);
727 return 1;
730 static void trace(struct kmem_cache *s, struct slqb_page *page,
731 void *object, int alloc)
733 if (s->flags & SLAB_TRACE) {
734 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
735 s->name,
736 alloc ? "alloc" : "free",
737 object, page->inuse,
738 page->freelist);
740 if (!alloc)
741 print_section("Object", (void *)object, s->objsize);
743 dump_stack();
747 static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
748 void *object)
750 if (!slab_debug(s))
751 return;
753 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
754 return;
756 init_object(s, object, 0);
757 init_tracking(s, object);
760 static int alloc_debug_processing(struct kmem_cache *s,
761 void *object, unsigned long addr)
763 struct slqb_page *page;
764 page = virt_to_head_slqb_page(object);
766 if (!check_slab(s, page))
767 goto bad;
769 if (!check_valid_pointer(s, page, object)) {
770 object_err(s, page, object, "Freelist Pointer check fails");
771 goto bad;
774 if (object && !check_object(s, page, object, 0))
775 goto bad;
777 	/* Success: perform special debug activities for allocs */
778 if (s->flags & SLAB_STORE_USER)
779 set_track(s, object, TRACK_ALLOC, addr);
780 trace(s, page, object, 1);
781 init_object(s, object, 1);
782 return 1;
784 bad:
785 return 0;
788 static int free_debug_processing(struct kmem_cache *s,
789 void *object, unsigned long addr)
791 struct slqb_page *page;
792 page = virt_to_head_slqb_page(object);
794 if (!check_slab(s, page))
795 goto fail;
797 if (!check_valid_pointer(s, page, object)) {
798 slab_err(s, page, "Invalid object pointer 0x%p", object);
799 goto fail;
802 if (!check_object(s, page, object, 1))
803 return 0;
805 /* Special debug activities for freeing objects */
806 if (s->flags & SLAB_STORE_USER)
807 set_track(s, object, TRACK_FREE, addr);
808 trace(s, page, object, 0);
809 init_object(s, object, 0);
810 return 1;
812 fail:
813 slab_fix(s, "Object at 0x%p not freed", object);
814 return 0;
817 static int __init setup_slqb_debug(char *str)
819 slqb_debug = DEBUG_DEFAULT_FLAGS;
820 if (*str++ != '=' || !*str) {
822 * No options specified. Switch on full debugging.
824 goto out;
827 if (*str == ',') {
829 * No options but restriction on slabs. This means full
830 * debugging for slabs matching a pattern.
832 goto check_slabs;
835 slqb_debug = 0;
836 if (*str == '-') {
838 * Switch off all debugging measures.
840 goto out;
844 * Determine which debug features should be switched on
846 for (; *str && *str != ','; str++) {
847 switch (tolower(*str)) {
848 case 'f':
849 slqb_debug |= SLAB_DEBUG_FREE;
850 break;
851 case 'z':
852 slqb_debug |= SLAB_RED_ZONE;
853 break;
854 case 'p':
855 slqb_debug |= SLAB_POISON;
856 break;
857 case 'u':
858 slqb_debug |= SLAB_STORE_USER;
859 break;
860 case 't':
861 slqb_debug |= SLAB_TRACE;
862 break;
863 case 'a':
864 slqb_debug |= SLAB_FAILSLAB;
865 break;
866 default:
867 printk(KERN_ERR "slqb_debug option '%c' "
868 "unknown. skipped\n", *str);
872 check_slabs:
873 if (*str == ',')
874 slqb_debug_slabs = str + 1;
875 out:
876 return 1;
878 __setup("slqb_debug", setup_slqb_debug);
880 static int __init setup_slqb_min_order(char *str)
882 get_option(&str, &slqb_min_order);
883 slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
885 return 1;
887 __setup("slqb_min_order=", setup_slqb_min_order);
889 static int __init setup_slqb_min_objects(char *str)
891 get_option(&str, &slqb_min_objects);
893 return 1;
896 __setup("slqb_min_objects=", setup_slqb_min_objects);
898 static unsigned long kmem_cache_flags(unsigned long objsize,
899 unsigned long flags, const char *name,
900 void (*ctor)(void *))
903 * Enable debugging if selected on the kernel commandline.
905 if (slqb_debug && (!slqb_debug_slabs ||
906 strncmp(slqb_debug_slabs, name,
907 strlen(slqb_debug_slabs)) == 0))
908 flags |= slqb_debug;
910 if (num_possible_nodes() > 1)
911 flags |= SLAB_NUMA;
913 return flags;
915 #else
916 static inline void setup_object_debug(struct kmem_cache *s,
917 struct slqb_page *page, void *object)
921 static inline int alloc_debug_processing(struct kmem_cache *s,
922 void *object, unsigned long addr)
924 return 0;
927 static inline int free_debug_processing(struct kmem_cache *s,
928 void *object, unsigned long addr)
930 return 0;
933 static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
935 return 1;
938 static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
939 void *object, int active)
941 return 1;
944 static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
948 static inline unsigned long kmem_cache_flags(unsigned long objsize,
949 unsigned long flags, const char *name, void (*ctor)(void *))
951 if (num_possible_nodes() > 1)
952 flags |= SLAB_NUMA;
953 return flags;
956 static const int slqb_debug;
957 #endif
960 * allocate a new slab (return its corresponding struct slqb_page)
962 static struct slqb_page *allocate_slab(struct kmem_cache *s,
963 gfp_t flags, int node)
965 struct slqb_page *page;
966 int pages = 1 << s->order;
968 flags |= s->allocflags;
970 page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
971 if (!page)
972 return NULL;
974 mod_zone_page_state(slqb_page_zone(page),
975 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
976 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
977 pages);
979 return page;
983 * Called once for each object on a new slab page
985 static void setup_object(struct kmem_cache *s,
986 struct slqb_page *page, void *object)
988 setup_object_debug(s, page, object);
989 if (unlikely(s->ctor))
990 s->ctor(object);
994 * Allocate a new slab, set up its object list.
996 static struct slqb_page *new_slab_page(struct kmem_cache *s,
997 gfp_t flags, int node, unsigned int colour)
999 struct slqb_page *page;
1000 void *start;
1001 void *last;
1002 void *p;
1004 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1006 page = allocate_slab(s,
1007 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1008 if (!page)
1009 goto out;
1011 page->flags |= PG_SLQB_BIT;
1013 start = page_address(&page->page);
1015 if (unlikely(slab_poison(s)))
1016 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1018 start += colour;
1020 last = start;
1021 for_each_object(p, s, start) {
1022 setup_object(s, page, p);
1023 set_freepointer(s, last, p);
1024 last = p;
1026 set_freepointer(s, last, NULL);
1028 page->freelist = start;
1029 page->inuse = 0;
1030 out:
1031 return page;
1035 * Free a slab page back to the page allocator
1037 static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
1039 int pages = 1 << s->order;
1041 if (unlikely(slab_debug(s))) {
1042 void *p;
1044 slab_pad_check(s, page);
1045 for_each_free_object(p, s, page->freelist)
1046 check_object(s, page, p, 0);
1049 mod_zone_page_state(slqb_page_zone(page),
1050 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1051 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1052 -pages);
1054 __free_slqb_pages(page, s->order, pages);
1057 static void rcu_free_slab(struct rcu_head *h)
1059 struct slqb_page *page;
1061 page = container_of(h, struct slqb_page, rcu_head);
1062 __free_slab(page->list->cache, page);
1065 static void free_slab(struct kmem_cache *s, struct slqb_page *page)
1067 VM_BUG_ON(page->inuse);
1068 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
1069 call_rcu(&page->rcu_head, rcu_free_slab);
1070 else
1071 __free_slab(s, page);
1075 * Return an object to its slab.
1077 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1078 * list_lock in the case of per-node list.
1080 static int free_object_to_page(struct kmem_cache *s,
1081 struct kmem_cache_list *l, struct slqb_page *page,
1082 void *object)
1084 VM_BUG_ON(page->list != l);
1086 set_freepointer(s, object, page->freelist);
1087 page->freelist = object;
1088 page->inuse--;
1090 if (!page->inuse) {
1091 if (likely(s->objects > 1)) {
1092 l->nr_partial--;
1093 list_del(&page->lru);
1095 l->nr_slabs--;
1096 free_slab(s, page);
1097 slqb_stat_inc(l, FLUSH_SLAB_FREE);
1098 return 1;
1100 } else if (page->inuse + 1 == s->objects) {
1101 l->nr_partial++;
1102 list_add(&page->lru, &l->partial);
1103 slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
1104 return 0;
1106 return 0;
1109 #ifdef CONFIG_SMP
1110 static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
1111 void *object, struct kmem_cache_cpu *c);
1112 #endif
1115 * Flush the LIFO list of objects on a list. They are sent back to their pages
1116 * in case the pages also belong to the list, or to our CPU's remote-free list
1117 * in the case they do not.
1119 * Doesn't flush the entire list. flush_free_list_all does.
1121 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1122 * list_lock in the case of per-node list.
1124 static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
1126 void **head;
1127 int nr;
1128 int locked = 0;
1130 nr = l->freelist.nr;
1131 if (unlikely(!nr))
1132 return;
1134 nr = min(slab_freebatch(s), nr);
1136 slqb_stat_inc(l, FLUSH_FREE_LIST);
1137 slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
1139 l->freelist.nr -= nr;
1140 head = l->freelist.head;
1142 do {
1143 struct slqb_page *page;
1144 void **object;
1146 object = head;
1147 VM_BUG_ON(!object);
1148 head = get_freepointer(s, object);
1149 page = virt_to_head_slqb_page(object);
1151 #ifdef CONFIG_SMP
1152 if (page->list != l) {
1153 struct kmem_cache_cpu *c;
1155 if (locked) {
1156 spin_unlock(&l->page_lock);
1157 locked = 0;
1160 c = get_cpu_slab(s, smp_processor_id());
1162 slab_free_to_remote(s, page, object, c);
1163 slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
1164 } else
1165 #endif
1167 if (!locked) {
1168 spin_lock(&l->page_lock);
1169 locked = 1;
1171 free_object_to_page(s, l, page, object);
1174 nr--;
1175 } while (nr);
1177 if (locked)
1178 spin_unlock(&l->page_lock);
1180 l->freelist.head = head;
1181 if (!l->freelist.nr)
1182 l->freelist.tail = NULL;
1185 static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
1187 while (l->freelist.nr)
1188 flush_free_list(s, l);
1191 #ifdef CONFIG_SMP
1193 * If enough objects have been remotely freed back to this list,
1194  * remote_free_check will be set, in which case we'll eventually come here
1195 * to take those objects off our remote_free list and onto our LIFO freelist.
1197 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1198 * list_lock in the case of per-node list.
1200 static void claim_remote_free_list(struct kmem_cache *s,
1201 struct kmem_cache_list *l)
1203 void **head, **tail;
1204 int nr;
1206 if (!l->remote_free.list.nr)
1207 return;
1209 spin_lock(&l->remote_free.lock);
1211 l->remote_free_check = 0;
1212 head = l->remote_free.list.head;
1213 l->remote_free.list.head = NULL;
1214 tail = l->remote_free.list.tail;
1215 l->remote_free.list.tail = NULL;
1216 nr = l->remote_free.list.nr;
1217 l->remote_free.list.nr = 0;
1219 spin_unlock(&l->remote_free.lock);
1221 VM_BUG_ON(!nr);
1223 if (!l->freelist.nr) {
1224 /* Get head hot for likely subsequent allocation or flush */
1225 prefetchw(head);
1226 l->freelist.head = head;
1227 } else
1228 set_freepointer(s, l->freelist.tail, head);
1229 l->freelist.tail = tail;
1231 l->freelist.nr += nr;
1233 slqb_stat_inc(l, CLAIM_REMOTE_LIST);
1234 slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
1236 #else
1237 static inline void claim_remote_free_list(struct kmem_cache *s,
1238 struct kmem_cache_list *l)
1241 #endif
1244 * Allocation fastpath. Get an object from the list's LIFO freelist, or
1245 * return NULL if it is empty.
1247 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1248 * list_lock in the case of per-node list.
1250 static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
1251 struct kmem_cache_list *l)
1253 void *object;
1255 object = l->freelist.head;
1256 if (likely(object)) {
1257 void *next = get_freepointer(s, object);
1259 VM_BUG_ON(!l->freelist.nr);
1260 l->freelist.nr--;
1261 l->freelist.head = next;
1263 return object;
1265 VM_BUG_ON(l->freelist.nr);
1267 #ifdef CONFIG_SMP
1268 if (unlikely(l->remote_free_check)) {
1269 claim_remote_free_list(s, l);
1271 if (l->freelist.nr > slab_hiwater(s))
1272 flush_free_list(s, l);
1274 /* repetition here helps gcc :( */
1275 object = l->freelist.head;
1276 if (likely(object)) {
1277 void *next = get_freepointer(s, object);
1279 VM_BUG_ON(!l->freelist.nr);
1280 l->freelist.nr--;
1281 l->freelist.head = next;
1283 return object;
1285 VM_BUG_ON(l->freelist.nr);
1287 #endif
1289 return NULL;
1293 * Slow(er) path. Get a page from this list's existing pages. Will be a
1294 * new empty page in the case that __slab_alloc_page has just been called
1295 * (empty pages otherwise never get queued up on the lists), or a partial page
1296 * already on the list.
1298 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1299 * list_lock in the case of per-node list.
1301 static noinline void *__cache_list_get_page(struct kmem_cache *s,
1302 struct kmem_cache_list *l)
1304 struct slqb_page *page;
1305 void *object;
1307 if (unlikely(!l->nr_partial))
1308 return NULL;
1310 page = list_first_entry(&l->partial, struct slqb_page, lru);
1311 VM_BUG_ON(page->inuse == s->objects);
1312 if (page->inuse + 1 == s->objects) {
1313 l->nr_partial--;
1314 list_del(&page->lru);
1317 VM_BUG_ON(!page->freelist);
1319 page->inuse++;
1321 object = page->freelist;
1322 page->freelist = get_freepointer(s, object);
1323 if (page->freelist)
1324 prefetchw(page->freelist);
1325 VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
1326 slqb_stat_inc(l, ALLOC_SLAB_FILL);
1328 return object;
1331 static void *cache_list_get_page(struct kmem_cache *s,
1332 struct kmem_cache_list *l)
1334 void *object;
1336 if (unlikely(!l->nr_partial))
1337 return NULL;
1339 spin_lock(&l->page_lock);
1340 object = __cache_list_get_page(s, l);
1341 spin_unlock(&l->page_lock);
1343 return object;
1347 * Allocation slowpath. Allocate a new slab page from the page allocator, and
1348 * put it on the list's partial list. Must be followed by an allocation so
1349 * that we don't have dangling empty pages on the partial list.
1351 * Returns 0 on allocation failure.
1353 * Must be called with interrupts disabled.
1355 static noinline void *__slab_alloc_page(struct kmem_cache *s,
1356 gfp_t gfpflags, int node)
1358 struct slqb_page *page;
1359 struct kmem_cache_list *l;
1360 struct kmem_cache_cpu *c;
1361 unsigned int colour;
1362 void *object;
1364 c = get_cpu_slab(s, smp_processor_id());
1365 colour = c->colour_next;
1366 c->colour_next += s->colour_off;
1367 if (c->colour_next >= s->colour_range)
1368 c->colour_next = 0;
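	/*
	 * Slab colouring: colour_next cycles through [0, colour_range) in
	 * steps of colour_off (set up in kmem_cache_open() from the per-slab
	 * leftover space), so successive slab pages start their object area
	 * at staggered offsets and objects from different slabs are spread
	 * across CPU cache sets.
	 */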
1370 /* Caller handles __GFP_ZERO */
1371 gfpflags &= ~__GFP_ZERO;
1373 if (gfpflags & __GFP_WAIT)
1374 local_irq_enable();
1375 page = new_slab_page(s, gfpflags, node, colour);
1376 if (gfpflags & __GFP_WAIT)
1377 local_irq_disable();
1378 if (unlikely(!page))
1379 return page;
1381 if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
1382 struct kmem_cache_cpu *c;
1383 int cpu = smp_processor_id();
1385 c = get_cpu_slab(s, cpu);
1386 l = &c->list;
1387 page->list = l;
1389 spin_lock(&l->page_lock);
1390 l->nr_slabs++;
1391 l->nr_partial++;
1392 list_add(&page->lru, &l->partial);
1393 slqb_stat_inc(l, ALLOC);
1394 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1395 object = __cache_list_get_page(s, l);
1396 spin_unlock(&l->page_lock);
1397 } else {
1398 #ifdef CONFIG_NUMA
1399 struct kmem_cache_node *n;
1401 n = s->node_slab[slqb_page_to_nid(page)];
1402 l = &n->list;
1403 page->list = l;
1405 spin_lock(&n->list_lock);
1406 spin_lock(&l->page_lock);
1407 l->nr_slabs++;
1408 l->nr_partial++;
1409 list_add(&page->lru, &l->partial);
1410 slqb_stat_inc(l, ALLOC);
1411 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1412 object = __cache_list_get_page(s, l);
1413 spin_unlock(&l->page_lock);
1414 spin_unlock(&n->list_lock);
1415 #endif
1417 VM_BUG_ON(!object);
1418 return object;
1421 #ifdef CONFIG_NUMA
1422 static noinline int alternate_nid(struct kmem_cache *s,
1423 gfp_t gfpflags, int node)
1425 if (in_interrupt() || (gfpflags & __GFP_THISNODE))
1426 return node;
1427 if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
1428 return cpuset_mem_spread_node();
1429 else if (current->mempolicy)
1430 return slab_node(current->mempolicy);
1431 return node;
1435 * Allocate an object from a remote node. Return NULL if none could be found
1436 * (in which case, caller should allocate a new slab)
1438 * Must be called with interrupts disabled.
1440 static void *__remote_slab_alloc_node(struct kmem_cache *s,
1441 gfp_t gfpflags, int node)
1443 struct kmem_cache_node *n;
1444 struct kmem_cache_list *l;
1445 void *object;
1447 n = s->node_slab[node];
1448 if (unlikely(!n)) /* node has no memory */
1449 return NULL;
1450 l = &n->list;
1452 spin_lock(&n->list_lock);
1454 object = __cache_list_get_object(s, l);
1455 if (unlikely(!object)) {
1456 object = cache_list_get_page(s, l);
1457 if (unlikely(!object)) {
1458 spin_unlock(&n->list_lock);
1459 return __slab_alloc_page(s, gfpflags, node);
1462 if (likely(object))
1463 slqb_stat_inc(l, ALLOC);
1464 spin_unlock(&n->list_lock);
1465 return object;
1468 static noinline void *__remote_slab_alloc(struct kmem_cache *s,
1469 gfp_t gfpflags, int node)
1471 void *object;
1472 struct zonelist *zonelist;
1473 struct zoneref *z;
1474 struct zone *zone;
1475 enum zone_type high_zoneidx = gfp_zone(gfpflags);
1477 object = __remote_slab_alloc_node(s, gfpflags, node);
1478 if (likely(object || (gfpflags & __GFP_THISNODE)))
1479 return object;
1481 zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
1482 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1483 if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
1484 continue;
1486 node = zone_to_nid(zone);
1487 object = __remote_slab_alloc_node(s, gfpflags, node);
1488 if (likely(object))
1489 return object;
1491 return NULL;
1493 #endif
1496 * Main allocation path. Return an object, or NULL on allocation failure.
1498 * Must be called with interrupts disabled.
1500 static __always_inline void *__slab_alloc(struct kmem_cache *s,
1501 gfp_t gfpflags, int node)
1503 void *object;
1504 struct kmem_cache_cpu *c;
1505 struct kmem_cache_list *l;
1507 #ifdef CONFIG_NUMA
1508 if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
1509 try_remote:
1510 return __remote_slab_alloc(s, gfpflags, node);
1512 #endif
1514 c = get_cpu_slab(s, smp_processor_id());
1515 VM_BUG_ON(!c);
1516 l = &c->list;
1517 object = __cache_list_get_object(s, l);
1518 if (unlikely(!object)) {
1519 #ifdef CONFIG_NUMA
1520 int thisnode = numa_node_id();
1523 * If the local node is memoryless, try remote alloc before
1524 * trying the page allocator. Otherwise, what happens is
1525 * objects are always freed to remote lists but the allocation
1526 * side always allocates a new page with only one object
1527 * used in each page
1529 if (unlikely(!node_state(thisnode, N_HIGH_MEMORY)))
1530 object = __remote_slab_alloc(s, gfpflags, thisnode);
1531 #endif
1533 if (!object) {
1534 object = cache_list_get_page(s, l);
1535 if (unlikely(!object)) {
1536 object = __slab_alloc_page(s, gfpflags, node);
1537 #ifdef CONFIG_NUMA
1538 if (unlikely(!object)) {
1539 node = numa_node_id();
1540 goto try_remote;
1542 #endif
1543 return object;
1547 if (likely(object))
1548 slqb_stat_inc(l, ALLOC);
1549 return object;
1553 * Perform some interrupts-on processing around the main allocation path
1554 * (debug checking and memset()ing).
1556 static __always_inline void *slab_alloc(struct kmem_cache *s,
1557 gfp_t gfpflags, int node, unsigned long addr)
1559 void *object;
1560 unsigned long flags;
1562 gfpflags &= gfp_allowed_mask;
1564 lockdep_trace_alloc(gfpflags);
1565 might_sleep_if(gfpflags & __GFP_WAIT);
1567 if (should_failslab(s->objsize, gfpflags, s->flags))
1568 return NULL;
1570 again:
1571 local_irq_save(flags);
1572 object = __slab_alloc(s, gfpflags, node);
1573 local_irq_restore(flags);
1575 if (unlikely(slab_debug(s)) && likely(object)) {
1576 if (unlikely(!alloc_debug_processing(s, object, addr)))
1577 goto again;
1580 if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
1581 memset(object, 0, s->objsize);
1583 return object;
1586 static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
1587 gfp_t gfpflags, unsigned long caller)
1589 int node = -1;
1591 #ifdef CONFIG_NUMA
1592 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
1593 node = alternate_nid(s, gfpflags, node);
1594 #endif
1595 return slab_alloc(s, gfpflags, node, caller);
1598 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1600 return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
1602 EXPORT_SYMBOL(kmem_cache_alloc);
1604 #ifdef CONFIG_NUMA
1605 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1607 return slab_alloc(s, gfpflags, node, _RET_IP_);
1609 EXPORT_SYMBOL(kmem_cache_alloc_node);
1610 #endif
1612 #ifdef CONFIG_SMP
1614 * Flush this CPU's remote free list of objects back to the list from where
1615 * they originate. They end up on that list's remotely freed list, and
1616  * eventually we set its remote_free_check if there are enough objects on it.
1618  * This seems convoluted, but it keeps us from stomping on the target CPU's
1619 * fastpath cachelines.
1621 * Must be called with interrupts disabled.
1623 static void flush_remote_free_cache(struct kmem_cache *s,
1624 struct kmem_cache_cpu *c)
1626 struct kmlist *src;
1627 struct kmem_cache_list *dst;
1628 unsigned int nr;
1629 int set;
1631 src = &c->rlist;
1632 nr = src->nr;
1633 if (unlikely(!nr))
1634 return;
1636 #ifdef CONFIG_SLQB_STATS
1638 struct kmem_cache_list *l = &c->list;
1640 slqb_stat_inc(l, FLUSH_RFREE_LIST);
1641 slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
1643 #endif
1645 dst = c->remote_cache_list;
1648 * Less common case, dst is filling up so free synchronously.
1649 	 * No point in having the remote CPU free these as it will just
1650 * free them back to the page list anyway.
1652 if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
1653 void **head;
1655 head = src->head;
1656 spin_lock(&dst->page_lock);
1657 do {
1658 struct slqb_page *page;
1659 void **object;
1661 object = head;
1662 VM_BUG_ON(!object);
1663 head = get_freepointer(s, object);
1664 page = virt_to_head_slqb_page(object);
1666 free_object_to_page(s, dst, page, object);
1667 nr--;
1668 } while (nr);
1669 spin_unlock(&dst->page_lock);
1671 src->head = NULL;
1672 src->tail = NULL;
1673 src->nr = 0;
1675 return;
1678 spin_lock(&dst->remote_free.lock);
1680 if (!dst->remote_free.list.head)
1681 dst->remote_free.list.head = src->head;
1682 else
1683 set_freepointer(s, dst->remote_free.list.tail, src->head);
1684 dst->remote_free.list.tail = src->tail;
1686 src->head = NULL;
1687 src->tail = NULL;
1688 src->nr = 0;
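	/*
	 * Only tell the owner CPU to check its remotely-freed queue when this
	 * flush pushes the queue across the freebatch watermark: "set" records
	 * whether we were below the watermark before adding our objects, so
	 * remote_free_check (and the owner's cacheline) is dirtied at most
	 * once per crossing rather than on every flush.
	 */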
1690 if (dst->remote_free.list.nr < slab_freebatch(s))
1691 set = 1;
1692 else
1693 set = 0;
1695 dst->remote_free.list.nr += nr;
1697 if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
1698 dst->remote_free_check = 1;
1700 spin_unlock(&dst->remote_free.lock);
1704 * Free an object to this CPU's remote free list.
1706 * Must be called with interrupts disabled.
1708 static noinline void slab_free_to_remote(struct kmem_cache *s,
1709 struct slqb_page *page, void *object,
1710 struct kmem_cache_cpu *c)
1712 struct kmlist *r;
1715 * Our remote free list corresponds to a different list. Must
1716 * flush it and switch.
1718 if (page->list != c->remote_cache_list) {
1719 flush_remote_free_cache(s, c);
1720 c->remote_cache_list = page->list;
1723 r = &c->rlist;
1724 if (!r->head)
1725 r->head = object;
1726 else
1727 set_freepointer(s, r->tail, object);
1728 set_freepointer(s, object, NULL);
1729 r->tail = object;
1730 r->nr++;
1732 if (unlikely(r->nr >= slab_freebatch(s)))
1733 flush_remote_free_cache(s, c);
1735 #endif
1738  * Main freeing path. Frees the object back to its cache's lists.
1740 * Must be called with interrupts disabled.
1742 static __always_inline void __slab_free(struct kmem_cache *s,
1743 struct slqb_page *page, void *object)
1745 struct kmem_cache_cpu *c;
1746 struct kmem_cache_list *l;
1747 int thiscpu = smp_processor_id();
1749 c = get_cpu_slab(s, thiscpu);
1750 l = &c->list;
1752 slqb_stat_inc(l, FREE);
1754 if (!NUMA_BUILD || !slab_numa(s) ||
1755 likely(slqb_page_to_nid(page) == numa_node_id())) {
1757 * Freeing fastpath. Collects all local-node objects, not
1758 * just those allocated from our per-CPU list. This allows
1759 * fast transfer of objects from one CPU to another within
1760 * a given node.
1762 set_freepointer(s, object, l->freelist.head);
1763 l->freelist.head = object;
1764 if (!l->freelist.nr)
1765 l->freelist.tail = object;
1766 l->freelist.nr++;
1768 if (unlikely(l->freelist.nr > slab_hiwater(s)))
1769 flush_free_list(s, l);
1771 } else {
1772 #ifdef CONFIG_SMP
1774 * Freeing an object that was allocated on a remote node.
1776 slab_free_to_remote(s, page, object, c);
1777 slqb_stat_inc(l, FREE_REMOTE);
1778 #endif
1783 * Perform some interrupts-on processing around the main freeing path
1784 * (debug checking).
1786 static __always_inline void slab_free(struct kmem_cache *s,
1787 struct slqb_page *page, void *object)
1789 unsigned long flags;
1791 prefetchw(object);
1793 debug_check_no_locks_freed(object, s->objsize);
1794 if (likely(object) && unlikely(slab_debug(s))) {
1795 if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
1796 return;
1799 local_irq_save(flags);
1800 __slab_free(s, page, object);
1801 local_irq_restore(flags);
1804 void kmem_cache_free(struct kmem_cache *s, void *object)
1806 struct slqb_page *page = NULL;
1808 if (slab_numa(s))
1809 page = virt_to_head_slqb_page(object);
1810 slab_free(s, page, object);
1812 EXPORT_SYMBOL(kmem_cache_free);
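/*
 * Typical usage from client code, via the standard slab cache API ("foo" is
 * a placeholder cache name and structure, shown only as an illustration):
 *
 *	struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL);
 *	obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, obj);
 */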
1815  * Calculate the order of allocation given a slab object size.
1817  * Order 0 allocations are preferred since order 0 does not cause fragmentation
1818  * in the page allocator, and they have fastpaths in the page allocator. But we
1819  * also try to minimise external fragmentation with large objects.
1821 static int slab_order(int size, int max_order, int frac)
1823 int order;
1825 if (fls(size - 1) <= PAGE_SHIFT)
1826 order = 0;
1827 else
1828 order = fls(size - 1) - PAGE_SHIFT;
1829 if (order < slqb_min_order)
1830 order = slqb_min_order;
1832 while (order <= max_order) {
1833 unsigned long slab_size = PAGE_SIZE << order;
1834 unsigned long objects;
1835 unsigned long waste;
1837 objects = slab_size / size;
1838 if (!objects)
1839 goto next;
1841 if (order < MAX_ORDER && objects < slqb_min_objects) {
1843 * if we don't have enough objects for min_objects,
1844 * then try the next size up. Unless we have reached
1845 * our maximum possible page size.
1847 goto next;
1850 waste = slab_size - (objects * size);
1852 if (waste * frac <= slab_size)
1853 break;
1855 next:
1856 order++;
1859 return order;
1862 static int calculate_order(int size)
1864 int order;
1867 * Attempt to find best configuration for a slab. This
1868 * works by first attempting to generate a layout with
1869 * the best configuration and backing off gradually.
1871 order = slab_order(size, 1, 4);
1872 if (order <= 1)
1873 return order;
1876 * This size cannot fit in order-1. Allow bigger orders, but
1877 * forget about trying to save space.
1879 order = slab_order(size, MAX_ORDER - 1, 0);
1880 if (order < MAX_ORDER)
1881 return order;
1883 return -ENOSYS;
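/*
 * Worked example (assuming 4KiB pages and the default slqb_min_order /
 * slqb_min_objects settings): for a 700-byte object, slab_order(700, 1, 4)
 * finds that an order-0 slab holds 5 objects and wastes 596 bytes, and
 * 596 * 4 <= 4096, so order 0 is chosen.  Objects that would waste more than
 * a quarter of an order-1 slab fall through to the second slab_order() call,
 * which ignores wastage and picks the smallest order that holds at least
 * slqb_min_objects objects.
 */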
1887 * Figure out what the alignment of the objects will be.
1889 static unsigned long calculate_alignment(unsigned long flags,
1890 unsigned long align, unsigned long size)
1893 * If the user wants hardware cache aligned objects then follow that
1894 * suggestion if the object is sufficiently large.
1896 * The hardware cache alignment cannot override the specified
1897  * alignment though. If that is greater, then use it.
1899 if (flags & SLAB_HWCACHE_ALIGN) {
1900 unsigned long ralign = cache_line_size();
1902 while (size <= ralign / 2)
1903 ralign /= 2;
1904 align = max(align, ralign);
1907 if (align < ARCH_SLAB_MINALIGN)
1908 align = ARCH_SLAB_MINALIGN;
1910 return ALIGN(align, sizeof(void *));
1913 static void init_kmem_cache_list(struct kmem_cache *s,
1914 struct kmem_cache_list *l)
1916 l->cache = s;
1917 l->freelist.nr = 0;
1918 l->freelist.head = NULL;
1919 l->freelist.tail = NULL;
1920 l->nr_partial = 0;
1921 l->nr_slabs = 0;
1922 INIT_LIST_HEAD(&l->partial);
1923 spin_lock_init(&l->page_lock);
1925 #ifdef CONFIG_SMP
1926 l->remote_free_check = 0;
1927 spin_lock_init(&l->remote_free.lock);
1928 l->remote_free.list.nr = 0;
1929 l->remote_free.list.head = NULL;
1930 l->remote_free.list.tail = NULL;
1931 #endif
1933 #ifdef CONFIG_SLQB_STATS
1934 memset(l->stats, 0, sizeof(l->stats));
1935 #endif
1938 static void init_kmem_cache_cpu(struct kmem_cache *s,
1939 struct kmem_cache_cpu *c)
1941 init_kmem_cache_list(s, &c->list);
1943 c->colour_next = 0;
1944 #ifdef CONFIG_SMP
1945 c->rlist.nr = 0;
1946 c->rlist.head = NULL;
1947 c->rlist.tail = NULL;
1948 c->remote_cache_list = NULL;
1949 #endif
1952 #ifdef CONFIG_NUMA
1953 static void init_kmem_cache_node(struct kmem_cache *s,
1954 struct kmem_cache_node *n)
1956 spin_lock_init(&n->list_lock);
1957 init_kmem_cache_list(s, &n->list);
1959 #endif
1961 /* Initial slabs. */
1962 #ifdef CONFIG_SMP
1963 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
1964 #endif
1965 #ifdef CONFIG_NUMA
1966 /* XXX: really need a DEFINE_PER_NODE for per-node data because a static
1967 * array is wasteful */
1968 static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES];
1969 #endif
1971 #ifdef CONFIG_SMP
1972 static struct kmem_cache kmem_cpu_cache;
1973 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
1974 #ifdef CONFIG_NUMA
1975 static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */
1976 #endif
1977 #endif
1979 #ifdef CONFIG_NUMA
1980 static struct kmem_cache kmem_node_cache;
1981 #ifdef CONFIG_SMP
1982 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
1983 #endif
1984 static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */
1985 #endif
1987 #ifdef CONFIG_SMP
1988 static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1989 int cpu)
1991 struct kmem_cache_cpu *c;
1992 int node;
1994 node = cpu_to_node(cpu);
1996 c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
1997 if (!c)
1998 return NULL;
2000 init_kmem_cache_cpu(s, c);
2001 return c;
2004 static void free_kmem_cache_cpus(struct kmem_cache *s)
2006 int cpu;
2008 for_each_online_cpu(cpu) {
2009 struct kmem_cache_cpu *c;
2011 c = s->cpu_slab[cpu];
2012 if (c) {
2013 kmem_cache_free(&kmem_cpu_cache, c);
2014 s->cpu_slab[cpu] = NULL;
2019 static int alloc_kmem_cache_cpus(struct kmem_cache *s)
2021 int cpu;
2023 for_each_online_cpu(cpu) {
2024 struct kmem_cache_cpu *c;
2026 c = s->cpu_slab[cpu];
2027 if (c)
2028 continue;
2030 c = alloc_kmem_cache_cpu(s, cpu);
2031 if (!c) {
2032 free_kmem_cache_cpus(s);
2033 return 0;
2035 s->cpu_slab[cpu] = c;
2037 return 1;
2040 #else
2041 static inline void free_kmem_cache_cpus(struct kmem_cache *s)
2045 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2047 init_kmem_cache_cpu(s, &s->cpu_slab);
2048 return 1;
2050 #endif
2052 #ifdef CONFIG_NUMA
2053 static void free_kmem_cache_nodes(struct kmem_cache *s)
2055 int node;
2057 for_each_node_state(node, N_NORMAL_MEMORY) {
2058 struct kmem_cache_node *n;
2060 n = s->node_slab[node];
2061 if (n) {
2062 kmem_cache_free(&kmem_node_cache, n);
2063 s->node_slab[node] = NULL;
2068 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2070 int node;
2072 for_each_node_state(node, N_NORMAL_MEMORY) {
2073 struct kmem_cache_node *n;
2075 n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
2076 if (!n) {
2077 free_kmem_cache_nodes(s);
2078 return 0;
2080 init_kmem_cache_node(s, n);
2081 s->node_slab[node] = n;
2083 return 1;
2085 #else
2086 static void free_kmem_cache_nodes(struct kmem_cache *s)
2090 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2092 return 1;
2094 #endif
2097 * calculate_sizes() determines the order and the distribution of data within
2098 * a slab object.
2100 static int calculate_sizes(struct kmem_cache *s)
2102 unsigned long flags = s->flags;
2103 unsigned long size = s->objsize;
2104 unsigned long align = s->align;
2107 * Determine if we can poison the object itself. If the user of
2108 * the slab may touch the object after free or before allocation
2109 * then we should never poison the object itself.
2111 if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
2112 s->flags |= __OBJECT_POISON;
2113 else
2114 s->flags &= ~__OBJECT_POISON;
2117 * Round up object size to the next word boundary. We can only
2118 * place the free pointer at word boundaries and this determines
2119 * the possible location of the free pointer.
2121 size = ALIGN(size, sizeof(void *));
2123 #ifdef CONFIG_SLQB_DEBUG
2125 * If we are Redzoning then check if there is some space between the
2126 * end of the object and the free pointer. If not then add an
2127 * additional word to have some bytes to store Redzone information.
2129 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2130 size += sizeof(void *);
2131 #endif
2134 * With that we have determined the number of bytes in actual use
2135 * by the object. This is the potential offset to the free pointer.
2137 s->inuse = size;
2139 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
2141 * Relocate free pointer after the object if it is not
2142 * permitted to overwrite the first word of the object on
2143 * kmem_cache_free.
2145 * This is the case if we do RCU, have a constructor or
2146 * destructor or are poisoning the objects.
2148 s->offset = size;
2149 size += sizeof(void *);
2152 #ifdef CONFIG_SLQB_DEBUG
2153 if (flags & SLAB_STORE_USER) {
2155 * Need to store information about allocs and frees after
2156 * the object.
2158 size += 2 * sizeof(struct track);
2161 if (flags & SLAB_RED_ZONE) {
2163 * Add some empty padding so that we can catch
2164 * overwrites from earlier objects rather than let
2165 * tracking information or the free pointer be
2166 		 * corrupted if a user writes before the start
2167 * of the object.
2169 size += sizeof(void *);
2171 #endif
2174 * Determine the alignment based on various parameters that the
2175 * user specified and the dynamic determination of cache line size
2176 * on bootup.
2178 align = calculate_alignment(flags, align, s->objsize);
2181 * SLQB stores one object immediately after another beginning from
2182 * offset 0. In order to align the objects we have to simply size
2183 * each object to conform to the alignment.
2185 size = ALIGN(size, align);
2186 s->size = size;
2187 s->order = calculate_order(size);
2189 if (s->order < 0)
2190 return 0;
2192 s->allocflags = 0;
2193 if (s->order)
2194 s->allocflags |= __GFP_COMP;
2196 if (s->flags & SLAB_CACHE_DMA)
2197 s->allocflags |= SLQB_DMA;
2199 if (s->flags & SLAB_RECLAIM_ACCOUNT)
2200 s->allocflags |= __GFP_RECLAIMABLE;
2203 * Determine the number of objects per slab
2205 s->objects = (PAGE_SIZE << s->order) / size;
2207 s->freebatch = max(4UL*PAGE_SIZE / size,
2208 min(256UL, 64*PAGE_SIZE / size));
2209 if (!s->freebatch)
2210 s->freebatch = 1;
2211 s->hiwater = s->freebatch << 2;
2213 return !!s->objects;
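/*
 * Worked example (assuming 4KiB pages, no debugging, no constructor, a zero
 * align argument and no SLAB_HWCACHE_ALIGN): a 192-byte object keeps
 * size == 192, an order-0 slab then holds 21 objects,
 * freebatch = max(4 * 4096 / 192, min(256, 64 * 4096 / 192)) = 256, and
 * hiwater = freebatch << 2 = 1024.
 */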
2217 #ifdef CONFIG_SMP
2219  * The per-CPU allocator can't be used because it always uses the slab allocator,
2220 * and it can't do per-node allocations.
2222 static void *kmem_cache_dyn_array_alloc(int ids)
2224 size_t size = sizeof(void *) * ids;
2226 BUG_ON(!size);
2228 if (unlikely(!slab_is_available())) {
2229 static void *nextmem;
2230 static size_t nextleft;
2231 void *ret;
2234 * Special case for setting up initial caches. These will
2235 * never get freed by definition so we can do it rather
2236 * simply.
2238 if (size > nextleft) {
2239 nextmem = alloc_pages_exact(size, GFP_KERNEL);
2240 if (!nextmem)
2241 return NULL;
2242 nextleft = roundup(size, PAGE_SIZE);
2245 ret = nextmem;
2246 nextleft -= size;
2247 nextmem += size;
2248 memset(ret, 0, size);
2249 return ret;
2250 } else {
2251 return kzalloc(size, GFP_KERNEL);
2255 static void kmem_cache_dyn_array_free(void *array)
2257 if (unlikely(!slab_is_available()))
2258 return; /* error case without crashing here (will panic soon) */
2259 kfree(array);
2261 #endif
2264 * Except in early boot, this should be called with slqb_lock held for write
2265 * to lock out hotplug, and protect list modifications.
2267 static int kmem_cache_open(struct kmem_cache *s,
2268 const char *name, size_t size, size_t align,
2269 unsigned long flags, void (*ctor)(void *), int alloc)
2271 unsigned int left_over;
2273 memset(s, 0, sizeof(struct kmem_cache));
2274 s->name = name;
2275 s->ctor = ctor;
2276 s->objsize = size;
2277 s->align = align;
2278 s->flags = kmem_cache_flags(size, flags, name, ctor);
2280 if (!calculate_sizes(s))
2281 goto error;
2283 if (!slab_debug(s)) {
2284 left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
2285 s->colour_off = max(cache_line_size(), s->align);
2286 s->colour_range = left_over;
2287 } else {
2288 s->colour_off = 0;
2289 s->colour_range = 0;
2292 #ifdef CONFIG_SMP
2293 s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
2294 if (!s->cpu_slab)
2295 goto error;
2296 # ifdef CONFIG_NUMA
2297 s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
2298 if (!s->node_slab)
2299 goto error_cpu_array;
2300 # endif
2301 #endif
2303 if (likely(alloc)) {
2304 if (!alloc_kmem_cache_nodes(s))
2305 goto error_node_array;
2307 if (!alloc_kmem_cache_cpus(s))
2308 goto error_nodes;
2311 sysfs_slab_add(s);
2312 list_add(&s->list, &slab_caches);
2314 return 1;
2316 error_nodes:
2317 free_kmem_cache_nodes(s);
2318 error_node_array:
2319 #if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
2320 kmem_cache_dyn_array_free(s->node_slab);
2321 error_cpu_array:
2322 #endif
2323 #ifdef CONFIG_SMP
2324 kmem_cache_dyn_array_free(s->cpu_slab);
2325 #endif
2326 error:
2327 if (flags & SLAB_PANIC)
2328 panic("%s: failed to create slab `%s'\n", __func__, name);
2329 return 0;
2333 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2334 * @s: the cache we're checking against
2335 * @ptr: pointer to validate
2337 * This verifies that the untrusted pointer looks sane;
2338 * it is _not_ a guarantee that the pointer is actually
2339 * part of the slab cache in question, but it at least
2340 * validates that the pointer can be dereferenced and
2341 * looks half-way sane.
2343 * Currently only used for dentry validation.
2345 int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
2347 unsigned long addr = (unsigned long)ptr;
2348 struct slqb_page *page;
2350 if (unlikely(addr < PAGE_OFFSET))
2351 goto out;
2352 if (unlikely(addr > (unsigned long)high_memory - s->size))
2353 goto out;
2354 if (unlikely(!IS_ALIGNED(addr, s->align)))
2355 goto out;
2356 if (unlikely(!kern_addr_valid(addr)))
2357 goto out;
2358 if (unlikely(!kern_addr_valid(addr + s->size - 1)))
2359 goto out;
2360 if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
2361 goto out;
2362 page = virt_to_head_slqb_page(ptr);
2363 if (unlikely(!(page->flags & PG_SLQB_BIT)))
2364 goto out;
2365 if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
2366 goto out;
2367 return 1;
2368 out:
2369 return 0;
2371 EXPORT_SYMBOL(kmem_ptr_validate);
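/*
 * Usage sketch (hypothetical cache and variable names, not taken from this
 * file): a subsystem holding a possibly-stale pointer can reject obvious
 * garbage cheaply before dereferencing it. This is only a sanity filter,
 * not proof that the object is live or owned by the caller.
 */
#if 0	/* example only */
	if (!kmem_ptr_validate(foo_cache, candidate))
		return -EINVAL;	/* failed a basic sanity check */
	/* candidate at least points into foo_cache-managed slab memory */
#endif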
2374 * Determine the size of a slab object
2376 unsigned int kmem_cache_size(struct kmem_cache *s)
2378 return s->objsize;
2380 EXPORT_SYMBOL(kmem_cache_size);
2382 const char *kmem_cache_name(struct kmem_cache *s)
2384 return s->name;
2386 EXPORT_SYMBOL(kmem_cache_name);
2389 * Release all resources used by a slab cache. No more concurrency on the
2390 * slab, so we can touch remote kmem_cache_cpu structures.
2392 void kmem_cache_destroy(struct kmem_cache *s)
2394 #ifdef CONFIG_NUMA
2395 int node;
2396 #endif
2397 int cpu;
2399 down_write(&slqb_lock);
2400 list_del(&s->list);
2402 local_irq_disable();
2403 #ifdef CONFIG_SMP
2404 for_each_online_cpu(cpu) {
2405 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2406 struct kmem_cache_list *l = &c->list;
2408 flush_free_list_all(s, l);
2409 flush_remote_free_cache(s, c);
2411 #endif
2413 for_each_online_cpu(cpu) {
2414 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2415 struct kmem_cache_list *l = &c->list;
2417 claim_remote_free_list(s, l);
2418 flush_free_list_all(s, l);
2420 WARN_ON(l->freelist.nr);
2421 WARN_ON(l->nr_slabs);
2422 WARN_ON(l->nr_partial);
2425 free_kmem_cache_cpus(s);
2427 #ifdef CONFIG_NUMA
2428 for_each_node_state(node, N_NORMAL_MEMORY) {
2429 struct kmem_cache_node *n;
2430 struct kmem_cache_list *l;
2432 n = s->node_slab[node];
2433 if (!n)
2434 continue;
2435 l = &n->list;
2437 claim_remote_free_list(s, l);
2438 flush_free_list_all(s, l);
2440 WARN_ON(l->freelist.nr);
2441 WARN_ON(l->nr_slabs);
2442 WARN_ON(l->nr_partial);
2445 free_kmem_cache_nodes(s);
2446 #endif
2447 local_irq_enable();
2449 sysfs_slab_remove(s);
2450 up_write(&slqb_lock);
2452 EXPORT_SYMBOL(kmem_cache_destroy);
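/*
 * Cache lifecycle as seen by a client (a minimal sketch; struct foo,
 * foo_cache and the error handling policy are illustrative, not part of
 * SLQB). The destroy path above assumes every object has already been
 * freed back to the cache.
 */
#if 0	/* example only */
struct foo {
	int value;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

	if (f) {
		f->value = 1;
		kmem_cache_free(foo_cache, f);
	}
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);	/* all objects must be freed by now */
}
#endif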
2454 /********************************************************************
2455 * Kmalloc subsystem
2456 *******************************************************************/
2458 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2459 EXPORT_SYMBOL(kmalloc_caches);
2461 #ifdef CONFIG_ZONE_DMA
2462 struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2463 EXPORT_SYMBOL(kmalloc_caches_dma);
2464 #endif
2466 #ifndef ARCH_KMALLOC_FLAGS
2467 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
2468 #endif
2470 static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
2471 const char *name, int size, gfp_t gfp_flags)
2473 unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
2475 if (gfp_flags & SLQB_DMA)
2476 flags |= SLAB_CACHE_DMA;
2478 kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
2480 return s;
2484  * Conversion table for small slab sizes / 8 to the index in the
2485  * kmalloc array. This is necessary for slabs < 192 since we have
2486  * non-power-of-two cache sizes there. The size of larger slabs can be
2487  * determined using fls.
2489 static s8 size_index[24] __cacheline_aligned = {
2490 3, /* 8 */
2491 4, /* 16 */
2492 5, /* 24 */
2493 5, /* 32 */
2494 6, /* 40 */
2495 6, /* 48 */
2496 6, /* 56 */
2497 6, /* 64 */
2498 #if L1_CACHE_BYTES < 64
2499 1, /* 72 */
2500 1, /* 80 */
2501 1, /* 88 */
2502 1, /* 96 */
2503 #else
2508 #endif
2509 7, /* 104 */
2510 7, /* 112 */
2511 7, /* 120 */
2512 7, /* 128 */
2513 #if L1_CACHE_BYTES < 128
2514 2, /* 136 */
2515 2, /* 144 */
2516 2, /* 152 */
2517 2, /* 160 */
2518 2, /* 168 */
2519 2, /* 176 */
2520 2, /* 184 */
2521 2 /* 192 */
2522 #else
2531 #endif
2534 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2536 int index;
2538 if (unlikely(size <= KMALLOC_MIN_SIZE)) {
2539 if (unlikely(!size))
2540 return ZERO_SIZE_PTR;
2542 index = KMALLOC_SHIFT_LOW;
2543 goto got_index;
2546 #if L1_CACHE_BYTES >= 128
2547 if (size <= 128) {
2548 #else
2549 if (size <= 192) {
2550 #endif
2551 index = size_index[(size - 1) / 8];
2552 } else {
2553 if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
2554 return NULL;
2556 index = fls(size - 1);
2559 got_index:
2560 if (unlikely((flags & SLQB_DMA)))
2561 return &kmalloc_caches_dma[index];
2562 else
2563 return &kmalloc_caches[index];
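/*
 * Worked example (assumes the usual KMALLOC_MIN_SIZE of 8 and
 * L1_CACHE_BYTES == 64):
 *
 *   kmalloc(100, ...):  100 <= 192, so index = size_index[(100 - 1) / 8]
 *                       = size_index[12] = 7  ->  kmalloc_caches[7] (128 B)
 *   kmalloc(1000, ...): 1000 > 192, so index = fls(999) = 10
 *                       ->  kmalloc_caches[10] (1024 B)
 *
 * A request with SLQB_DMA set takes the same index into
 * kmalloc_caches_dma[] instead.
 */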
2566 void *__kmalloc(size_t size, gfp_t flags)
2568 struct kmem_cache *s;
2570 s = get_slab(size, flags);
2571 if (unlikely(ZERO_OR_NULL_PTR(s)))
2572 return s;
2574 return __kmem_cache_alloc(s, flags, _RET_IP_);
2576 EXPORT_SYMBOL(__kmalloc);
2578 #ifdef CONFIG_NUMA
2579 void *__kmalloc_node(size_t size, gfp_t flags, int node)
2581 struct kmem_cache *s;
2583 s = get_slab(size, flags);
2584 if (unlikely(ZERO_OR_NULL_PTR(s)))
2585 return s;
2587 return kmem_cache_alloc_node(s, flags, node);
2589 EXPORT_SYMBOL(__kmalloc_node);
2590 #endif
2592 size_t ksize(const void *object)
2594 struct slqb_page *page;
2595 struct kmem_cache *s;
2597 BUG_ON(!object);
2598 if (unlikely(object == ZERO_SIZE_PTR))
2599 return 0;
2601 page = virt_to_head_slqb_page(object);
2602 BUG_ON(!(page->flags & PG_SLQB_BIT));
2604 s = page->list->cache;
2607 * Debugging requires use of the padding between object
2608 * and whatever may come after it.
2610 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2611 return s->objsize;
2614  * If we need to store the freelist pointer back there,
2615  * or track user information, then we can only use the
2616  * space before that information.
2618 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2619 return s->inuse;
2622  * Otherwise we can use all the padding etc. for the allocation
2624 return s->size;
2626 EXPORT_SYMBOL(ksize);
2628 void kfree(const void *object)
2630 struct kmem_cache *s;
2631 struct slqb_page *page;
2633 if (unlikely(ZERO_OR_NULL_PTR(object)))
2634 return;
2636 page = virt_to_head_slqb_page(object);
2637 s = page->list->cache;
2639 slab_free(s, page, (void *)object);
2641 EXPORT_SYMBOL(kfree);
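/*
 * Round-trip sketch (illustrative; the exact ksize() result depends on the
 * cache geometry and debug flags discussed above): __kmalloc() picks a
 * cache via get_slab(), ksize() reports that cache's usable object size,
 * and kfree() hands the object back to the owning cache via slab_free().
 */
#if 0	/* example only */
	void *p = kmalloc(100, GFP_KERNEL);	/* served from kmalloc-128 */

	if (p) {
		size_t usable = ksize(p);	/* typically 128 here */

		memset(p, 0, usable);		/* the whole object is usable */
		kfree(p);
	}
#endif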
2643 static void kmem_cache_trim_percpu(void *arg)
2645 int cpu = smp_processor_id();
2646 struct kmem_cache *s = arg;
2647 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2648 struct kmem_cache_list *l = &c->list;
2650 claim_remote_free_list(s, l);
2651 flush_free_list(s, l);
2652 #ifdef CONFIG_SMP
2653 flush_remote_free_cache(s, c);
2654 #endif
2657 int kmem_cache_shrink(struct kmem_cache *s)
2659 #ifdef CONFIG_NUMA
2660 int node;
2661 #endif
2663 on_each_cpu(kmem_cache_trim_percpu, s, 1);
2665 #ifdef CONFIG_NUMA
2666 for_each_node_state(node, N_NORMAL_MEMORY) {
2667 struct kmem_cache_node *n;
2668 struct kmem_cache_list *l;
2670 n = s->node_slab[node];
2671 if (!n)
2672 continue;
2673 l = &n->list;
2675 spin_lock_irq(&n->list_lock);
2676 claim_remote_free_list(s, l);
2677 flush_free_list(s, l);
2678 spin_unlock_irq(&n->list_lock);
2680 #endif
2682 return 0;
2684 EXPORT_SYMBOL(kmem_cache_shrink);
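/*
 * Usage note (illustrative): a cache owner may call kmem_cache_shrink()
 * after releasing a large batch of objects to drain the queues and return
 * empty pages sooner than the periodic cache_trim_worker() below would,
 * e.g. kmem_cache_shrink(foo_cache) with the cache from the earlier sketch.
 */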
2686 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2687 static void kmem_cache_reap_percpu(void *arg)
2689 int cpu = smp_processor_id();
2690 struct kmem_cache *s;
2691 long phase = (long)arg;
2693 list_for_each_entry(s, &slab_caches, list) {
2694 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2695 struct kmem_cache_list *l = &c->list;
2697 if (phase == 0) {
2698 flush_free_list_all(s, l);
2699 flush_remote_free_cache(s, c);
2702 if (phase == 1) {
2703 claim_remote_free_list(s, l);
2704 flush_free_list_all(s, l);
2709 static void kmem_cache_reap(void)
2711 struct kmem_cache *s;
2712 int node;
2714 down_read(&slqb_lock);
2715 on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
2716 on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
2718 list_for_each_entry(s, &slab_caches, list) {
2719 for_each_node_state(node, N_NORMAL_MEMORY) {
2720 struct kmem_cache_node *n;
2721 struct kmem_cache_list *l;
2723 n = s->node_slab[node];
2724 if (!n)
2725 continue;
2726 l = &n->list;
2728 spin_lock_irq(&n->list_lock);
2729 claim_remote_free_list(s, l);
2730 flush_free_list_all(s, l);
2731 spin_unlock_irq(&n->list_lock);
2734 up_read(&slqb_lock);
2736 #endif
2738 static void cache_trim_worker(struct work_struct *w)
2740 struct delayed_work *work =
2741 container_of(w, struct delayed_work, work);
2742 struct kmem_cache *s;
2744 if (!down_read_trylock(&slqb_lock))
2745 goto out;
2747 list_for_each_entry(s, &slab_caches, list) {
2748 #ifdef CONFIG_NUMA
2749 int node = numa_node_id();
2750 struct kmem_cache_node *n = s->node_slab[node];
2752 if (n) {
2753 struct kmem_cache_list *l = &n->list;
2755 spin_lock_irq(&n->list_lock);
2756 claim_remote_free_list(s, l);
2757 flush_free_list(s, l);
2758 spin_unlock_irq(&n->list_lock);
2760 #endif
2762 local_irq_disable();
2763 kmem_cache_trim_percpu(s);
2764 local_irq_enable();
2767 up_read(&slqb_lock);
2768 out:
2769 schedule_delayed_work(work, round_jiffies_relative(3*HZ));
2772 static DEFINE_PER_CPU(struct delayed_work, slqb_cache_trim_work);
2774 static void __cpuinit start_cpu_timer(int cpu)
2776 struct delayed_work *cache_trim_work = &per_cpu(slqb_cache_trim_work,
2777 cpu);
2780 * When this gets called from do_initcalls via cpucache_init(),
2781  * init_workqueues() has already run, so keventd will already
2782  * be set up by then.
2784 if (keventd_up() && cache_trim_work->work.func == NULL) {
2785 INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
2786 schedule_delayed_work_on(cpu, cache_trim_work,
2787 __round_jiffies_relative(HZ, cpu));
2791 static int __init cpucache_init(void)
2793 int cpu;
2795 for_each_online_cpu(cpu)
2796 start_cpu_timer(cpu);
2798 return 0;
2800 device_initcall(cpucache_init);
2802 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2803 static void slab_mem_going_offline_callback(void *arg)
2805 kmem_cache_reap();
2808 static void slab_mem_offline_callback(void *arg)
2810 /* XXX: should release structures, see CPU offline comment */
2813 static int slab_mem_going_online_callback(void *arg)
2815 struct kmem_cache *s;
2816 struct kmem_cache_node *n;
2817 struct memory_notify *marg = arg;
2818 int nid = marg->status_change_nid;
2819 int ret = 0;
2822 * If the node's memory is already available, then kmem_cache_node is
2823 * already created. Nothing to do.
2825 if (nid < 0)
2826 return 0;
2829  * We are bringing a node online. No memory is available yet. We must
2830 * allocate a kmem_cache_node structure in order to bring the node
2831 * online.
2833 down_write(&slqb_lock);
2834 list_for_each_entry(s, &slab_caches, list) {
2836          * XXX: kmem_cache_alloc_node will fall back to other nodes
2837 * since memory is not yet available from the node that
2838 * is brought up.
2840         if (s->node_slab[nid]) /* could be leftover from last online */
2841 continue;
2842 n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
2843 if (!n) {
2844 ret = -ENOMEM;
2845 goto out;
2847 init_kmem_cache_node(s, n);
2848 s->node_slab[nid] = n;
2850 out:
2851 up_write(&slqb_lock);
2852 return ret;
2855 static int slab_memory_callback(struct notifier_block *self,
2856 unsigned long action, void *arg)
2858 int ret = 0;
2860 switch (action) {
2861 case MEM_GOING_ONLINE:
2862 ret = slab_mem_going_online_callback(arg);
2863 break;
2864 case MEM_GOING_OFFLINE:
2865 slab_mem_going_offline_callback(arg);
2866 break;
2867 case MEM_OFFLINE:
2868 case MEM_CANCEL_ONLINE:
2869 slab_mem_offline_callback(arg);
2870 break;
2871 case MEM_ONLINE:
2872 case MEM_CANCEL_OFFLINE:
2873 break;
2876 if (ret)
2877 ret = notifier_from_errno(ret);
2878 else
2879 ret = NOTIFY_OK;
2880 return ret;
2883 #endif /* CONFIG_MEMORY_HOTPLUG */
2885 /********************************************************************
2886 * Basic setup of slabs
2887 *******************************************************************/
2889 void __init kmem_cache_init(void)
2891 int i;
2892 unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
2895 * All the ifdefs are rather ugly here, but it's just the setup code,
2896 * so it doesn't have to be too readable :)
2900 * No need to take slqb_lock here: there should be no concurrency
2901 * anyway, and spin_unlock_irq in rwsem code could enable interrupts
2902 * too early.
2904 kmem_cache_open(&kmem_cache_cache, "kmem_cache",
2905 sizeof(struct kmem_cache), 0, flags, NULL, 0);
2906 #ifdef CONFIG_SMP
2907 kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
2908 sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
2909 #endif
2910 #ifdef CONFIG_NUMA
2911 kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
2912 sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
2913 #endif
2915 #ifdef CONFIG_SMP
2916 for_each_possible_cpu(i) {
2917 struct kmem_cache_cpu *c;
2919 c = &per_cpu(kmem_cache_cpus, i);
2920 init_kmem_cache_cpu(&kmem_cache_cache, c);
2921 kmem_cache_cache.cpu_slab[i] = c;
2923 c = &per_cpu(kmem_cpu_cpus, i);
2924 init_kmem_cache_cpu(&kmem_cpu_cache, c);
2925 kmem_cpu_cache.cpu_slab[i] = c;
2927 #ifdef CONFIG_NUMA
2928 c = &per_cpu(kmem_node_cpus, i);
2929 init_kmem_cache_cpu(&kmem_node_cache, c);
2930 kmem_node_cache.cpu_slab[i] = c;
2931 #endif
2933 #else
2934 init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
2935 #endif
2937 #ifdef CONFIG_NUMA
2938 for_each_node_state(i, N_NORMAL_MEMORY) {
2939 struct kmem_cache_node *n;
2941 n = &kmem_cache_nodes[i];
2942 init_kmem_cache_node(&kmem_cache_cache, n);
2943 kmem_cache_cache.node_slab[i] = n;
2944 #ifdef CONFIG_SMP
2945 n = &kmem_cpu_nodes[i];
2946 init_kmem_cache_node(&kmem_cpu_cache, n);
2947 kmem_cpu_cache.node_slab[i] = n;
2948 #endif
2949 n = &kmem_node_nodes[i];
2950 init_kmem_cache_node(&kmem_node_cache, n);
2951 kmem_node_cache.node_slab[i] = n;
2953 #endif
2955     /* Caches that are not power-of-two sized */
2956 if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
2957 open_kmalloc_cache(&kmalloc_caches[1],
2958 "kmalloc-96", 96, GFP_KERNEL);
2959 #ifdef CONFIG_ZONE_DMA
2960 open_kmalloc_cache(&kmalloc_caches_dma[1],
2961 "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
2962 #endif
2964 if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
2965 open_kmalloc_cache(&kmalloc_caches[2],
2966 "kmalloc-192", 192, GFP_KERNEL);
2967 #ifdef CONFIG_ZONE_DMA
2968 open_kmalloc_cache(&kmalloc_caches_dma[2],
2969 "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
2970 #endif
2973 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2974 open_kmalloc_cache(&kmalloc_caches[i],
2975 "kmalloc", 1 << i, GFP_KERNEL);
2976 #ifdef CONFIG_ZONE_DMA
2977 open_kmalloc_cache(&kmalloc_caches_dma[i],
2978 "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
2979 #endif
2983 * Patch up the size_index table if we have strange large alignment
2984  * requirements for the kmalloc array. This seems to be the case
2985  * only for MIPS. The standard arches will not generate any code here.
2987 * Largest permitted alignment is 256 bytes due to the way we
2988 * handle the index determination for the smaller caches.
2990 * Make sure that nothing crazy happens if someone starts tinkering
2991 * around with ARCH_KMALLOC_MINALIGN
2993 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2994 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2996 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2997 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2999 /* Provide the correct kmalloc names now that the caches are up */
3000 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
3001 kmalloc_caches[i].name =
3002 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3003 #ifdef CONFIG_ZONE_DMA
3004 kmalloc_caches_dma[i].name =
3005 kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
3006 #endif
3009 #ifdef CONFIG_SMP
3010 register_cpu_notifier(&slab_notifier);
3011 #endif
3012 #ifdef CONFIG_NUMA
3013 hotplug_memory_notifier(slab_memory_callback, 1);
3014 #endif
3016 * smp_init() has not yet been called, so no worries about memory
3017 * ordering with __slab_is_available.
3019 __slab_is_available = 1;
3022 void __init kmem_cache_init_late(void)
3027 * Some basic slab creation sanity checks
3029 static int kmem_cache_create_ok(const char *name, size_t size,
3030 size_t align, unsigned long flags)
3032 struct kmem_cache *tmp;
3035 * Sanity checks... these are all serious usage bugs.
3037 if (!name || in_interrupt() || (size < sizeof(void *))) {
3038 printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
3039 name);
3040 dump_stack();
3042 return 0;
3045 list_for_each_entry(tmp, &slab_caches, list) {
3046 char x;
3047 int res;
3050          * This happens when a module gets unloaded without destroying
3051          * its slab cache, and nobody else has reused the module's vmalloc
3052          * area. Print a warning.
3054 res = probe_kernel_address(tmp->name, x);
3055 if (res) {
3056 printk(KERN_ERR
3057 "SLAB: cache with size %d has lost its name\n",
3058 tmp->size);
3059 continue;
3062 if (!strcmp(tmp->name, name)) {
3063 printk(KERN_ERR
3064 "SLAB: duplicate cache %s\n", name);
3065 dump_stack();
3067 return 0;
3071 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
3072 if (flags & SLAB_DESTROY_BY_RCU)
3073 WARN_ON(flags & SLAB_POISON);
3075 return 1;
3078 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3079 size_t align, unsigned long flags, void (*ctor)(void *))
3081 struct kmem_cache *s;
3083 down_write(&slqb_lock);
3084 if (!kmem_cache_create_ok(name, size, align, flags))
3085 goto err;
3087 s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
3088 if (!s)
3089 goto err;
3091 if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) {
3092 up_write(&slqb_lock);
3093 return s;
3096 kmem_cache_free(&kmem_cache_cache, s);
3098 err:
3099 up_write(&slqb_lock);
3100 if (flags & SLAB_PANIC)
3101 panic("%s: failed to create slab `%s'\n", __func__, name);
3103 return NULL;
3105 EXPORT_SYMBOL(kmem_cache_create);
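/*
 * Sketch of a cache with a constructor (names are illustrative). The
 * constructor is run when objects are set up on a freshly allocated slab
 * page, not on every kmem_cache_alloc(), so it should only establish
 * invariants that remain valid across free/alloc cycles (locks, list
 * heads), at least under the usual slab-constructor contract assumed here.
 */
#if 0	/* example only */
struct bar {
	spinlock_t lock;
	struct list_head entries;
};

static void bar_ctor(void *obj)
{
	struct bar *b = obj;

	spin_lock_init(&b->lock);
	INIT_LIST_HEAD(&b->entries);
}

static struct kmem_cache *bar_cache;

static int __init bar_init(void)
{
	bar_cache = kmem_cache_create("bar_cache", sizeof(struct bar), 0,
				      SLAB_HWCACHE_ALIGN, bar_ctor);
	return bar_cache ? 0 : -ENOMEM;
}
#endif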
3107 #ifdef CONFIG_SMP
3109  * Use the cpu notifier to ensure that the cpu slabs are flushed when
3110 * necessary.
3112 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3113 unsigned long action, void *hcpu)
3115 long cpu = (long)hcpu;
3116 struct kmem_cache *s;
3118 switch (action) {
3119 case CPU_UP_PREPARE:
3120 case CPU_UP_PREPARE_FROZEN:
3121 down_write(&slqb_lock);
3122 list_for_each_entry(s, &slab_caches, list) {
3123             if (s->cpu_slab[cpu]) /* could be leftover from last online */
3124 continue;
3125 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
3126 if (!s->cpu_slab[cpu]) {
3127                 up_write(&slqb_lock);
3128 return NOTIFY_BAD;
3131 up_write(&slqb_lock);
3132 break;
3134 case CPU_ONLINE:
3135 case CPU_ONLINE_FROZEN:
3136 case CPU_DOWN_FAILED:
3137 case CPU_DOWN_FAILED_FROZEN:
3138 start_cpu_timer(cpu);
3139 break;
3141 case CPU_DOWN_PREPARE:
3142 case CPU_DOWN_PREPARE_FROZEN:
3143 cancel_rearming_delayed_work(&per_cpu(slqb_cache_trim_work,
3144 cpu));
3145 per_cpu(slqb_cache_trim_work, cpu).work.func = NULL;
3146 break;
3148 case CPU_UP_CANCELED:
3149 case CPU_UP_CANCELED_FROZEN:
3150 case CPU_DEAD:
3151 case CPU_DEAD_FROZEN:
3153 * XXX: Freeing here doesn't work because objects can still be
3154          * on this CPU's list. The periodic timer needs to check if a CPU
3155          * is offline and then try to clean up from there. Same for node
3156 * offline.
3158 default:
3159 break;
3161 return NOTIFY_OK;
3164 static struct notifier_block __cpuinitdata slab_notifier = {
3165 .notifier_call = slab_cpuup_callback
3168 #endif
3170 #ifdef CONFIG_SLQB_DEBUG
3171 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3173 struct kmem_cache *s;
3174 int node = -1;
3176 s = get_slab(size, flags);
3177 if (unlikely(ZERO_OR_NULL_PTR(s)))
3178 return s;
3180 #ifdef CONFIG_NUMA
3181 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3182 node = alternate_nid(s, flags, node);
3183 #endif
3184 return slab_alloc(s, flags, node, caller);
3187 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
3188 unsigned long caller)
3190 struct kmem_cache *s;
3192 s = get_slab(size, flags);
3193 if (unlikely(ZERO_OR_NULL_PTR(s)))
3194 return s;
3196 return slab_alloc(s, flags, node, caller);
3198 #endif
3200 #if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
3201 struct stats_gather {
3202 struct kmem_cache *s;
3203 spinlock_t lock;
3204 unsigned long nr_slabs;
3205 unsigned long nr_partial;
3206 unsigned long nr_inuse;
3207 unsigned long nr_objects;
3209 #ifdef CONFIG_SLQB_STATS
3210 unsigned long stats[NR_SLQB_STAT_ITEMS];
3211 #endif
3214 static void __gather_stats(void *arg)
3216 unsigned long nr_slabs;
3217 unsigned long nr_partial;
3218 unsigned long nr_inuse;
3219 struct stats_gather *gather = arg;
3220 int cpu = smp_processor_id();
3221 struct kmem_cache *s = gather->s;
3222 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3223 struct kmem_cache_list *l = &c->list;
3224 struct slqb_page *page;
3225 #ifdef CONFIG_SLQB_STATS
3226 int i;
3227 #endif
3229 spin_lock(&l->page_lock);
3230 nr_slabs = l->nr_slabs;
3231 nr_partial = l->nr_partial;
3232 nr_inuse = (nr_slabs - nr_partial) * s->objects;
3234 list_for_each_entry(page, &l->partial, lru) {
3235 nr_inuse += page->inuse;
3237 spin_unlock(&l->page_lock);
3239 spin_lock(&gather->lock);
3240 gather->nr_slabs += nr_slabs;
3241 gather->nr_partial += nr_partial;
3242 gather->nr_inuse += nr_inuse;
3243 #ifdef CONFIG_SLQB_STATS
3244 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3245 gather->stats[i] += l->stats[i];
3246 #endif
3247 spin_unlock(&gather->lock);
3250 /* must be called with slqb_lock held */
3251 static void gather_stats_locked(struct kmem_cache *s,
3252 struct stats_gather *stats)
3254 #ifdef CONFIG_NUMA
3255 int node;
3256 #endif
3258 memset(stats, 0, sizeof(struct stats_gather));
3259 stats->s = s;
3260 spin_lock_init(&stats->lock);
3262 on_each_cpu(__gather_stats, stats, 1);
3264 #ifdef CONFIG_NUMA
3265 for_each_online_node(node) {
3266 struct kmem_cache_node *n = s->node_slab[node];
3267 struct kmem_cache_list *l = &n->list;
3268 struct slqb_page *page;
3269 unsigned long flags;
3270 #ifdef CONFIG_SLQB_STATS
3271 int i;
3272 #endif
3274 spin_lock_irqsave(&n->list_lock, flags);
3275 #ifdef CONFIG_SLQB_STATS
3276 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3277 stats->stats[i] += l->stats[i];
3278 #endif
3279 stats->nr_slabs += l->nr_slabs;
3280 stats->nr_partial += l->nr_partial;
3281 stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
3283 list_for_each_entry(page, &l->partial, lru) {
3284 stats->nr_inuse += page->inuse;
3286 spin_unlock_irqrestore(&n->list_lock, flags);
3288 #endif
3290 stats->nr_objects = stats->nr_slabs * s->objects;
3293 #ifdef CONFIG_SLQB_SYSFS
3294 static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
3296 down_read(&slqb_lock); /* hold off hotplug */
3297 gather_stats_locked(s, stats);
3298 up_read(&slqb_lock);
3300 #endif
3301 #endif
3304 * The /proc/slabinfo ABI
3306 #ifdef CONFIG_SLABINFO
3307 #include <linux/proc_fs.h>
3308 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3309 size_t count, loff_t *ppos)
3311 return -EINVAL;
3314 static void print_slabinfo_header(struct seq_file *m)
3316 seq_puts(m, "slabinfo - version: 2.1\n");
3317 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3318 "<objperslab> <pagesperslab>");
3319 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3320 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3321 seq_putc(m, '\n');
3324 static void *s_start(struct seq_file *m, loff_t *pos)
3326 loff_t n = *pos;
3328 down_read(&slqb_lock);
3329 if (!n)
3330 print_slabinfo_header(m);
3332 return seq_list_start(&slab_caches, *pos);
3335 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3337 return seq_list_next(p, &slab_caches, pos);
3340 static void s_stop(struct seq_file *m, void *p)
3342 up_read(&slqb_lock);
3345 static int s_show(struct seq_file *m, void *p)
3347 struct stats_gather stats;
3348 struct kmem_cache *s;
3350 s = list_entry(p, struct kmem_cache, list);
3352 gather_stats_locked(s, &stats);
3354 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
3355 stats.nr_objects, s->size, s->objects, (1 << s->order));
3356 seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
3357 slab_freebatch(s), 0);
3358 seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
3359 stats.nr_slabs, 0UL);
3360 seq_putc(m, '\n');
3361 return 0;
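/*
 * Sample of the line emitted above (numbers invented but internally
 * consistent for a 128-byte cache with 32 objects per order-0 slab and
 * 16 slabs, some of them partially used):
 *
 *   kmalloc-128          320    512    128   32    1 : tunables 1024  256    0 : slabdata     16     16      0
 */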
3364 static const struct seq_operations slabinfo_op = {
3365 .start = s_start,
3366 .next = s_next,
3367 .stop = s_stop,
3368 .show = s_show,
3371 static int slabinfo_open(struct inode *inode, struct file *file)
3373 return seq_open(file, &slabinfo_op);
3376 static const struct file_operations proc_slabinfo_operations = {
3377 .open = slabinfo_open,
3378 .read = seq_read,
3379 .llseek = seq_lseek,
3380 .release = seq_release,
3383 static int __init slab_proc_init(void)
3385 proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
3386 &proc_slabinfo_operations);
3387 return 0;
3389 module_init(slab_proc_init);
3390 #endif /* CONFIG_SLABINFO */
3392 #ifdef CONFIG_SLQB_SYSFS
3394 * sysfs API
3396 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3397 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
3399 struct slab_attribute {
3400 struct attribute attr;
3401 ssize_t (*show)(struct kmem_cache *s, char *buf);
3402 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3405 #define SLAB_ATTR_RO(_name) \
3406 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3408 #define SLAB_ATTR(_name) \
3409 static struct slab_attribute _name##_attr = \
3410 __ATTR(_name, 0644, _name##_show, _name##_store)
3412 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3414 return sprintf(buf, "%d\n", s->size);
3416 SLAB_ATTR_RO(slab_size);
3418 static ssize_t align_show(struct kmem_cache *s, char *buf)
3420 return sprintf(buf, "%d\n", s->align);
3422 SLAB_ATTR_RO(align);
3424 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3426 return sprintf(buf, "%d\n", s->objsize);
3428 SLAB_ATTR_RO(object_size);
3430 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3432 return sprintf(buf, "%d\n", s->objects);
3434 SLAB_ATTR_RO(objs_per_slab);
3436 static ssize_t order_show(struct kmem_cache *s, char *buf)
3438 return sprintf(buf, "%d\n", s->order);
3440 SLAB_ATTR_RO(order);
3442 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3444 if (s->ctor) {
3445 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3447 return n + sprintf(buf + n, "\n");
3449 return 0;
3451 SLAB_ATTR_RO(ctor);
3453 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3455 struct stats_gather stats;
3457 gather_stats(s, &stats);
3459 return sprintf(buf, "%lu\n", stats.nr_slabs);
3461 SLAB_ATTR_RO(slabs);
3463 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3465 struct stats_gather stats;
3467 gather_stats(s, &stats);
3469 return sprintf(buf, "%lu\n", stats.nr_inuse);
3471 SLAB_ATTR_RO(objects);
3473 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3475 struct stats_gather stats;
3477 gather_stats(s, &stats);
3479 return sprintf(buf, "%lu\n", stats.nr_objects);
3481 SLAB_ATTR_RO(total_objects);
3483 #ifdef CONFIG_FAILSLAB
3484 static ssize_t failslab_show(struct kmem_cache *s, char *buf)
3486 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
3489 static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
3490 size_t length)
3492 s->flags &= ~SLAB_FAILSLAB;
3493 if (buf[0] == '1')
3494 s->flags |= SLAB_FAILSLAB;
3495 return length;
3497 SLAB_ATTR(failslab);
3498 #endif
3500 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3502 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3504 SLAB_ATTR_RO(reclaim_account);
3506 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3508 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3510 SLAB_ATTR_RO(hwcache_align);
3512 #ifdef CONFIG_ZONE_DMA
3513 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3515 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3517 SLAB_ATTR_RO(cache_dma);
3518 #endif
3520 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3522 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3524 SLAB_ATTR_RO(destroy_by_rcu);
3526 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3528 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3530 SLAB_ATTR_RO(red_zone);
3532 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3534 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3536 SLAB_ATTR_RO(poison);
3538 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3540 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3542 SLAB_ATTR_RO(store_user);
3544 static ssize_t hiwater_store(struct kmem_cache *s,
3545 const char *buf, size_t length)
3547 long hiwater;
3548 int err;
3550 err = strict_strtol(buf, 10, &hiwater);
3551 if (err)
3552 return err;
3554 if (hiwater < 0)
3555 return -EINVAL;
3557 s->hiwater = hiwater;
3559 return length;
3562 static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
3564 return sprintf(buf, "%d\n", slab_hiwater(s));
3566 SLAB_ATTR(hiwater);
3568 static ssize_t freebatch_store(struct kmem_cache *s,
3569 const char *buf, size_t length)
3571 long freebatch;
3572 int err;
3574 err = strict_strtol(buf, 10, &freebatch);
3575 if (err)
3576 return err;
3578 if (freebatch <= 0 || freebatch - 1 > s->hiwater)
3579 return -EINVAL;
3581 s->freebatch = freebatch;
3583 return length;
3586 static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
3588 return sprintf(buf, "%d\n", slab_freebatch(s));
3590 SLAB_ATTR(freebatch);
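/*
 * Tuning sketch (path assumes the "slab" kset registered under kernel_kobj
 * in slab_sysfs_init() below, i.e. /sys/kernel/slab/<cache>/). Raise the
 * watermark first so the freebatch check against hiwater cannot fail:
 *
 *   echo 2048 > /sys/kernel/slab/kmalloc-128/hiwater
 *   echo  512 > /sys/kernel/slab/kmalloc-128/freebatch
 */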
3592 #ifdef CONFIG_SLQB_STATS
3593 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3595 struct stats_gather stats;
3596 int len;
3597 #ifdef CONFIG_SMP
3598 int cpu;
3599 #endif
3601 gather_stats(s, &stats);
3603 len = sprintf(buf, "%lu", stats.stats[si]);
3605 #ifdef CONFIG_SMP
3606 for_each_online_cpu(cpu) {
3607 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3608 struct kmem_cache_list *l = &c->list;
3610 if (len < PAGE_SIZE - 20)
3611 len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
3613 #endif
3614 return len + sprintf(buf + len, "\n");
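/*
 * Example of a resulting stat file (values invented): the cache-wide total
 * comes first, followed by a per-CPU breakdown while it fits in the buffer:
 *
 *   31842 C0=10211 C1=9963 C2=5830 C3=5838
 */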
3617 #define STAT_ATTR(si, text) \
3618 static ssize_t text##_show(struct kmem_cache *s, char *buf) \
3620 return show_stat(s, buf, si); \
3622 SLAB_ATTR_RO(text); \
3624 STAT_ATTR(ALLOC, alloc);
3625 STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
3626 STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
3627 STAT_ATTR(FREE, free);
3628 STAT_ATTR(FREE_REMOTE, free_remote);
3629 STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
3630 STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
3631 STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
3632 STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
3633 STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
3634 STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
3635 STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
3636 STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
3637 STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
3638 #endif
3640 static struct attribute *slab_attrs[] = {
3641 &slab_size_attr.attr,
3642 &object_size_attr.attr,
3643 &objs_per_slab_attr.attr,
3644 &order_attr.attr,
3645 &objects_attr.attr,
3646 &total_objects_attr.attr,
3647 &slabs_attr.attr,
3648 &ctor_attr.attr,
3649 &align_attr.attr,
3650 &hwcache_align_attr.attr,
3651 &reclaim_account_attr.attr,
3652 &destroy_by_rcu_attr.attr,
3653 &red_zone_attr.attr,
3654 &poison_attr.attr,
3655 &store_user_attr.attr,
3656 &hiwater_attr.attr,
3657 &freebatch_attr.attr,
3658 #ifdef CONFIG_ZONE_DMA
3659 &cache_dma_attr.attr,
3660 #endif
3661 #ifdef CONFIG_SLQB_STATS
3662 &alloc_attr.attr,
3663 &alloc_slab_fill_attr.attr,
3664 &alloc_slab_new_attr.attr,
3665 &free_attr.attr,
3666 &free_remote_attr.attr,
3667 &flush_free_list_attr.attr,
3668 &flush_free_list_objects_attr.attr,
3669 &flush_free_list_remote_attr.attr,
3670 &flush_slab_partial_attr.attr,
3671 &flush_slab_free_attr.attr,
3672 &flush_rfree_list_attr.attr,
3673 &flush_rfree_list_objects_attr.attr,
3674 &claim_remote_list_attr.attr,
3675 &claim_remote_list_objects_attr.attr,
3676 #endif
3677 #ifdef CONFIG_FAILSLAB
3678 &failslab_attr.attr,
3679 #endif
3681 NULL
3684 static struct attribute_group slab_attr_group = {
3685 .attrs = slab_attrs,
3688 static ssize_t slab_attr_show(struct kobject *kobj,
3689 struct attribute *attr, char *buf)
3691 struct slab_attribute *attribute;
3692 struct kmem_cache *s;
3693 int err;
3695 attribute = to_slab_attr(attr);
3696 s = to_slab(kobj);
3698 if (!attribute->show)
3699 return -EIO;
3701 err = attribute->show(s, buf);
3703 return err;
3706 static ssize_t slab_attr_store(struct kobject *kobj,
3707 struct attribute *attr, const char *buf, size_t len)
3709 struct slab_attribute *attribute;
3710 struct kmem_cache *s;
3711 int err;
3713 attribute = to_slab_attr(attr);
3714 s = to_slab(kobj);
3716 if (!attribute->store)
3717 return -EIO;
3719 err = attribute->store(s, buf, len);
3721 return err;
3724 static void kmem_cache_release(struct kobject *kobj)
3726 struct kmem_cache *s = to_slab(kobj);
3728 kmem_cache_free(&kmem_cache_cache, s);
3731 static struct sysfs_ops slab_sysfs_ops = {
3732 .show = slab_attr_show,
3733 .store = slab_attr_store,
3736 static struct kobj_type slab_ktype = {
3737 .sysfs_ops = &slab_sysfs_ops,
3738 .release = kmem_cache_release
3741 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3743 struct kobj_type *ktype = get_ktype(kobj);
3745 if (ktype == &slab_ktype)
3746 return 1;
3747 return 0;
3750 static struct kset_uevent_ops slab_uevent_ops = {
3751 .filter = uevent_filter,
3754 static struct kset *slab_kset;
3756 static int sysfs_available __read_mostly;
3758 static int sysfs_slab_add(struct kmem_cache *s)
3760 int err;
3762 if (!sysfs_available)
3763 return 0;
3765 s->kobj.kset = slab_kset;
3766 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name);
3767 if (err) {
3768 kobject_put(&s->kobj);
3769 return err;
3772 err = sysfs_create_group(&s->kobj, &slab_attr_group);
3773 if (err)
3774 return err;
3776 kobject_uevent(&s->kobj, KOBJ_ADD);
3778 return 0;
3781 static void sysfs_slab_remove(struct kmem_cache *s)
3783 kobject_uevent(&s->kobj, KOBJ_REMOVE);
3784 kobject_del(&s->kobj);
3785 kobject_put(&s->kobj);
3788 static int __init slab_sysfs_init(void)
3790 struct kmem_cache *s;
3791 int err;
3793 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
3794 if (!slab_kset) {
3795 printk(KERN_ERR "Cannot register slab subsystem.\n");
3796 return -ENOSYS;
3799 down_write(&slqb_lock);
3801 sysfs_available = 1;
3803 list_for_each_entry(s, &slab_caches, list) {
3804 err = sysfs_slab_add(s);
3805 if (err)
3806 printk(KERN_ERR "SLQB: Unable to add boot slab %s"
3807 " to sysfs\n", s->name);
3810 up_write(&slqb_lock);
3812 return 0;
3814 device_initcall(slab_sysfs_init);
3816 #endif