On Tue, Nov 06, 2007 at 02:33:53AM -0800, akpm@linux-foundation.org wrote:
[mmotm.git] / mm / slqb.c
1 /*
2 * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
3 * with order-0 allocations. Fastpath emphasis is placed on local allocation
4 * and freeing, but with a secondary goal of good remote freeing (freeing on
5 * another CPU from that which allocated).
7 * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
8 */
10 #include <linux/mm.h>
11 #include <linux/swap.h> /* struct reclaim_state */
12 #include <linux/module.h>
13 #include <linux/interrupt.h>
14 #include <linux/slab.h>
15 #include <linux/seq_file.h>
16 #include <linux/cpu.h>
17 #include <linux/cpuset.h>
18 #include <linux/mempolicy.h>
19 #include <linux/ctype.h>
20 #include <linux/kallsyms.h>
21 #include <linux/memory.h>
22 #include <linux/fault-inject.h>
25 * TODO
26 * - fix up releasing of offlined data structures. Not a big deal because
27 * they don't get cumulatively leaked with successive online/offline cycles
28 * - allow OOM conditions to flush back per-CPU pages to common lists to be
29 * reused by other CPUs.
30 * - investigate performance with memoryless nodes. Perhaps CPUs can be given
31 * a default closest home node via which they can use fastpath functions.
32 * Perhaps it is not a big problem.
36 * slqb_page overloads struct page, and is used to manage some slab allocation
37 * aspects, however to avoid the horrible mess in include/linux/mm_types.h,
38 * we'll just define our own struct slqb_page type variant here.
40 struct slqb_page {
41 union {
42 struct {
43 unsigned long flags; /* mandatory */
44 atomic_t _count; /* mandatory */
45 unsigned int inuse; /* Nr of objects */
46 struct kmem_cache_list *list; /* Pointer to list */
47 void **freelist; /* LIFO freelist */
48 union {
49 struct list_head lru; /* misc. list */
50 struct rcu_head rcu_head; /* for rcu freeing */
53 struct page page;
56 static inline void struct_slqb_page_wrong_size(void)
57 { BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
59 #define PG_SLQB_BIT (1 << PG_slab)
62 * slqb_min_order: minimum allocation order for slabs
64 static int slqb_min_order;
67 * slqb_min_objects: minimum number of objects per slab. Increasing this
68 * will increase the allocation order for slabs with larger objects
70 static int slqb_min_objects = 1;
72 #ifdef CONFIG_NUMA
73 static inline int slab_numa(struct kmem_cache *s)
75 return s->flags & SLAB_NUMA;
77 #else
78 static inline int slab_numa(struct kmem_cache *s)
80 return 0;
82 #endif
84 static inline int slab_hiwater(struct kmem_cache *s)
86 return s->hiwater;
89 static inline int slab_freebatch(struct kmem_cache *s)
91 return s->freebatch;
95 * Lock order:
96 * kmem_cache_node->list_lock
97 * kmem_cache_remote_free->lock
99 * Data structures:
100 * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
102 * - A LIFO list of node-local objects. Allocation and freeing of node local
103 * objects goes first to this list.
105 * - 2 Lists of slab pages, free and partial pages. If an allocation misses
106 * the object list, it tries from the partial list, then the free list.
107 * After freeing an object to the object list, if it is over a watermark,
108 * some objects are freed back to pages. If an allocation misses these lists,
109 * a new slab page is allocated from the page allocator. If the free list
110 * reaches a watermark, some of its pages are returned to the page allocator.
112 * - A remote free queue, where objects freed that did not come from the local
113 * node are queued to. When this reaches a watermark, the objects are
114 * flushed.
116 * - A remotely freed queue, where objects allocated from this CPU are flushed
117 * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
118 * used to protect access to this queue.
120 * When the remotely freed queue reaches a watermark, a flag is set to tell
121 * the owner CPU to check it. The owner CPU will then check the queue on the
122 * next allocation that misses the object list. It will move all objects from
123 * this list onto the object list and then allocate one.
125 * This system of remote queueing is intended to reduce lock and remote
126 * cacheline acquisitions, and give a cooling off period for remotely freed
127 * objects before they are re-allocated.
129 * Node-specific allocations from somewhere other than the local node are
130 * handled by a per-node list which is the same as the above per-CPU data
131 * structures except for the following differences:
133 * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
134 * allocate from a given node.
136 * - There is no remote free queue. Nodes don't free objects, CPUs do.
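As an aside, the queueing discipline described in the comment block above can be modelled in a few lines of plain C. This is a simplified sketch with made-up types and names (obj, cpu_list, flush_one), not the kernel code; it only illustrates the LIFO-plus-watermark behaviour of the per-CPU object list.

	struct obj { struct obj *next; };

	struct cpu_list {
		struct obj *head;	/* LIFO list of node-local free objects */
		int nr;			/* current length of the list */
		int hiwater;		/* flush threshold (cf. slab_hiwater) */
		int freebatch;		/* objects flushed per batch (cf. slab_freebatch) */
	};

	/* Fastpath allocation: pop the head of the LIFO list, if any. */
	static struct obj *list_alloc(struct cpu_list *l)
	{
		struct obj *o = l->head;

		if (o) {
			l->head = o->next;
			l->nr--;
		}
		return o;	/* NULL means: fall back to partial/free pages */
	}

	/* Fastpath free: push onto the LIFO list; flush a batch past the watermark. */
	static void list_free(struct cpu_list *l, struct obj *o,
			      void (*flush_one)(struct obj *))
	{
		o->next = l->head;
		l->head = o;
		if (++l->nr > l->hiwater) {
			int batch = l->freebatch;

			while (batch-- && l->head) {
				struct obj *victim = l->head;

				l->head = victim->next;
				l->nr--;
				flush_one(victim);	/* to its page or a remote queue */
			}
		}
	}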
139 static inline void slqb_stat_inc(struct kmem_cache_list *list,
140 enum stat_item si)
142 #ifdef CONFIG_SLQB_STATS
143 list->stats[si]++;
144 #endif
147 static inline void slqb_stat_add(struct kmem_cache_list *list,
148 enum stat_item si, unsigned long nr)
150 #ifdef CONFIG_SLQB_STATS
151 list->stats[si] += nr;
152 #endif
155 static inline int slqb_page_to_nid(struct slqb_page *page)
157 return page_to_nid(&page->page);
160 static inline void *slqb_page_address(struct slqb_page *page)
162 return page_address(&page->page);
165 static inline struct zone *slqb_page_zone(struct slqb_page *page)
167 return page_zone(&page->page);
170 static inline int virt_to_nid(const void *addr)
172 return page_to_nid(virt_to_page(addr));
175 static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
177 struct page *p;
179 p = virt_to_head_page(addr);
180 return (struct slqb_page *)p;
183 static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
184 int pages)
186 struct page *p = &page->page;
188 reset_page_mapcount(p);
189 p->mapping = NULL;
190 VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
191 p->flags &= ~PG_SLQB_BIT;
193 if (current->reclaim_state)
194 current->reclaim_state->reclaimed_slab += pages;
195 __free_pages(p, order);
198 #ifdef CONFIG_SLQB_DEBUG
199 static inline int slab_debug(struct kmem_cache *s)
201 return s->flags &
202 (SLAB_DEBUG_FREE |
203 SLAB_RED_ZONE |
204 SLAB_POISON |
205 SLAB_STORE_USER |
206 SLAB_TRACE);
208 static inline int slab_poison(struct kmem_cache *s)
210 return s->flags & SLAB_POISON;
212 #else
213 static inline int slab_debug(struct kmem_cache *s)
215 return 0;
217 static inline int slab_poison(struct kmem_cache *s)
219 return 0;
221 #endif
223 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
224 SLAB_POISON | SLAB_STORE_USER)
226 /* Internal SLQB flags */
227 #define __OBJECT_POISON 0x80000000 /* Poison object */
229 /* Not all arches define cache_line_size */
230 #ifndef cache_line_size
231 #define cache_line_size() L1_CACHE_BYTES
232 #endif
234 #ifdef CONFIG_SMP
235 static struct notifier_block slab_notifier;
236 #endif
239 * slqb_lock protects slab_caches list and serialises hotplug operations.
240 * Hotplug operations take the lock for write; other operations can hold off
241 * hotplug by taking it for read (or write).
243 static DECLARE_RWSEM(slqb_lock);
246 * A list of all slab caches on the system
248 static LIST_HEAD(slab_caches);
251 * Tracking user of a slab.
253 struct track {
254 unsigned long addr; /* Called from address */
255 int cpu; /* Was running on cpu */
256 int pid; /* Pid context */
257 unsigned long when; /* When did the operation occur */
260 enum track_item { TRACK_ALLOC, TRACK_FREE };
262 static struct kmem_cache kmem_cache_cache;
264 #ifdef CONFIG_SLQB_SYSFS
265 static int sysfs_slab_add(struct kmem_cache *s);
266 static void sysfs_slab_remove(struct kmem_cache *s);
267 #else
268 static inline int sysfs_slab_add(struct kmem_cache *s)
270 return 0;
272 static inline void sysfs_slab_remove(struct kmem_cache *s)
274 kmem_cache_free(&kmem_cache_cache, s);
276 #endif
278 /********************************************************************
279 * Core slab cache functions
280 *******************************************************************/
282 static int __slab_is_available __read_mostly;
283 int slab_is_available(void)
285 return __slab_is_available;
288 static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
290 #ifdef CONFIG_SMP
291 VM_BUG_ON(!s->cpu_slab[cpu]);
292 return s->cpu_slab[cpu];
293 #else
294 return &s->cpu_slab;
295 #endif
298 static inline int check_valid_pointer(struct kmem_cache *s,
299 struct slqb_page *page, const void *object)
301 void *base;
303 base = slqb_page_address(page);
304 if (object < base || object >= base + s->objects * s->size ||
305 (object - base) % s->size) {
306 return 0;
309 return 1;
312 static inline void *get_freepointer(struct kmem_cache *s, void *object)
314 return *(void **)(object + s->offset);
317 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
319 *(void **)(object + s->offset) = fp;
322 /* Loop over all objects in a slab */
323 #define for_each_object(__p, __s, __addr) \
324 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
325 __p += (__s)->size)
327 /* Scan freelist */
328 #define for_each_free_object(__p, __s, __free) \
329 for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
330 __p))
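The helpers and macros above rely on storing the next-free pointer inside the free object itself, at byte offset s->offset. A standalone userspace sketch of that encoding (with a hypothetical offset of 0, not taken from slqb.c) looks like this:

	#include <stdio.h>

	int main(void)
	{
		char obj[3][64];	/* three "objects" of size 64 */
		void *p;

		/* Chain them: obj[0] -> obj[1] -> obj[2] -> NULL; freelist head is obj[0]. */
		*(void **)obj[0] = obj[1];
		*(void **)obj[1] = obj[2];
		*(void **)obj[2] = NULL;

		/* Equivalent of for_each_free_object() with offset 0. */
		for (p = obj[0]; p != NULL; p = *(void **)p)
			printf("free object at %p\n", p);
		return 0;
	}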
332 #ifdef CONFIG_SLQB_DEBUG
334 * Debug settings:
336 #ifdef CONFIG_SLQB_DEBUG_ON
337 static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
338 #else
339 static int slqb_debug __read_mostly;
340 #endif
342 static char *slqb_debug_slabs;
345 * Object debugging
347 static void print_section(char *text, u8 *addr, unsigned int length)
349 int i, offset;
350 int newline = 1;
351 char ascii[17];
353 ascii[16] = 0;
355 for (i = 0; i < length; i++) {
356 if (newline) {
357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
358 newline = 0;
360 printk(KERN_CONT " %02x", addr[i]);
361 offset = i % 16;
362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
363 if (offset == 15) {
364 printk(KERN_CONT " %s\n", ascii);
365 newline = 1;
368 if (!newline) {
369 i %= 16;
370 while (i < 16) {
371 printk(KERN_CONT " ");
372 ascii[i] = ' ';
373 i++;
375 printk(KERN_CONT " %s\n", ascii);
379 static struct track *get_track(struct kmem_cache *s, void *object,
380 enum track_item alloc)
382 struct track *p;
384 if (s->offset)
385 p = object + s->offset + sizeof(void *);
386 else
387 p = object + s->inuse;
389 return p + alloc;
392 static void set_track(struct kmem_cache *s, void *object,
393 enum track_item alloc, unsigned long addr)
395 struct track *p;
397 if (s->offset)
398 p = object + s->offset + sizeof(void *);
399 else
400 p = object + s->inuse;
402 p += alloc;
403 if (addr) {
404 p->addr = addr;
405 p->cpu = raw_smp_processor_id();
406 p->pid = current ? current->pid : -1;
407 p->when = jiffies;
408 } else
409 memset(p, 0, sizeof(struct track));
412 static void init_tracking(struct kmem_cache *s, void *object)
414 if (!(s->flags & SLAB_STORE_USER))
415 return;
417 set_track(s, object, TRACK_FREE, 0UL);
418 set_track(s, object, TRACK_ALLOC, 0UL);
421 static void print_track(const char *s, struct track *t)
423 if (!t->addr)
424 return;
426 printk(KERN_ERR "INFO: %s in ", s);
427 __print_symbol("%s", (unsigned long)t->addr);
428 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
431 static void print_tracking(struct kmem_cache *s, void *object)
433 if (!(s->flags & SLAB_STORE_USER))
434 return;
436 print_track("Allocated", get_track(s, object, TRACK_ALLOC));
437 print_track("Freed", get_track(s, object, TRACK_FREE));
440 static void print_page_info(struct slqb_page *page)
442 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
443 page, page->inuse, page->freelist, page->flags);
447 #define MAX_ERR_STR 100
448 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
450 va_list args;
451 char buf[MAX_ERR_STR];
453 va_start(args, fmt);
454 vsnprintf(buf, sizeof(buf), fmt, args);
455 va_end(args);
456 printk(KERN_ERR "========================================"
457 "=====================================\n");
458 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
459 printk(KERN_ERR "----------------------------------------"
460 "-------------------------------------\n\n");
463 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
465 va_list args;
466 char buf[100];
468 va_start(args, fmt);
469 vsnprintf(buf, sizeof(buf), fmt, args);
470 va_end(args);
471 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
474 static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
476 unsigned int off; /* Offset of last byte */
477 u8 *addr = slqb_page_address(page);
479 print_tracking(s, p);
481 print_page_info(page);
483 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
484 p, p - addr, get_freepointer(s, p));
486 if (p > addr + 16)
487 print_section("Bytes b4", p - 16, 16);
489 print_section("Object", p, min(s->objsize, 128));
491 if (s->flags & SLAB_RED_ZONE)
492 print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
494 if (s->offset)
495 off = s->offset + sizeof(void *);
496 else
497 off = s->inuse;
499 if (s->flags & SLAB_STORE_USER)
500 off += 2 * sizeof(struct track);
502 if (off != s->size) {
503 /* Beginning of the filler is the free pointer */
504 print_section("Padding", p + off, s->size - off);
507 dump_stack();
510 static void object_err(struct kmem_cache *s, struct slqb_page *page,
511 u8 *object, char *reason)
513 slab_bug(s, reason);
514 print_trailer(s, page, object);
517 static void slab_err(struct kmem_cache *s, struct slqb_page *page,
518 char *fmt, ...)
520 slab_bug(s, fmt);
521 print_page_info(page);
522 dump_stack();
525 static void init_object(struct kmem_cache *s, void *object, int active)
527 u8 *p = object;
529 if (s->flags & __OBJECT_POISON) {
530 memset(p, POISON_FREE, s->objsize - 1);
531 p[s->objsize - 1] = POISON_END;
534 if (s->flags & SLAB_RED_ZONE) {
535 memset(p + s->objsize,
536 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
537 s->inuse - s->objsize);
541 static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
543 while (bytes) {
544 if (*start != (u8)value)
545 return start;
546 start++;
547 bytes--;
549 return NULL;
552 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
553 void *from, void *to)
555 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
556 memset(from, data, to - from);
559 static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
560 u8 *object, char *what,
561 u8 *start, unsigned int value, unsigned int bytes)
563 u8 *fault;
564 u8 *end;
566 fault = check_bytes(start, value, bytes);
567 if (!fault)
568 return 1;
570 end = start + bytes;
571 while (end > fault && end[-1] == value)
572 end--;
574 slab_bug(s, "%s overwritten", what);
575 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
576 fault, end - 1, fault[0], value);
577 print_trailer(s, page, object);
579 restore_bytes(s, what, value, fault, end);
580 return 0;
584 * Object layout:
586 * object address
587 * Bytes of the object to be managed.
588 * If the freepointer may overlay the object then the free
589 * pointer is the first word of the object.
591 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
592 * 0xa5 (POISON_END)
594 * object + s->objsize
595 * Padding to reach word boundary. This is also used for Redzoning.
596 * Padding is extended by another word if Redzoning is enabled and
597 * objsize == inuse.
599 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
600 * 0xcc (RED_ACTIVE) for objects in use.
602 * object + s->inuse
603 * Meta data starts here.
605 * A. Free pointer (if we cannot overwrite object on free)
606 * B. Tracking data for SLAB_STORE_USER
607 * C. Padding to reach required alignment boundary or at minimum
608 * one word if debugging is on to be able to detect writes
609 * before the word boundary.
611 * Padding is done using 0x5a (POISON_INUSE)
613 * object + s->size
614 * Nothing is used beyond s->size.
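A concrete (hypothetical) instance of this layout, assuming a 64-bit build where sizeof(void *) == 8 and sizeof(struct track) == 24, objsize = 20, and SLAB_RED_ZONE, SLAB_POISON and SLAB_STORE_USER all set:

	bytes  0..19	the object itself (poisoned with 0x6b / trailing 0xa5 when free)
	bytes 20..23	red zone up to the word boundary (s->inuse = 24)
	bytes 24..31	free pointer (s->offset = 24, moved here because of poisoning)
	bytes 32..79	two struct track records (alloc and free)
	bytes 80..87	one extra word of padding (0x5a POISON_INUSE), added because red zoning is on

	s->size = 88 after ALIGN() to the cache's (8-byte) alignment.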
617 static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
619 unsigned long off = s->inuse; /* The end of info */
621 if (s->offset) {
622 /* Freepointer is placed after the object. */
623 off += sizeof(void *);
626 if (s->flags & SLAB_STORE_USER) {
627 /* We also have user information there */
628 off += 2 * sizeof(struct track);
631 if (s->size == off)
632 return 1;
634 return check_bytes_and_report(s, page, p, "Object padding",
635 p + off, POISON_INUSE, s->size - off);
638 static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
640 u8 *start;
641 u8 *fault;
642 u8 *end;
643 int length;
644 int remainder;
646 if (!(s->flags & SLAB_POISON))
647 return 1;
649 start = slqb_page_address(page);
650 end = start + (PAGE_SIZE << s->order);
651 length = s->objects * s->size;
652 remainder = end - (start + length);
653 if (!remainder)
654 return 1;
656 fault = check_bytes(start + length, POISON_INUSE, remainder);
657 if (!fault)
658 return 1;
660 while (end > fault && end[-1] == POISON_INUSE)
661 end--;
663 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
664 print_section("Padding", start, length);
666 restore_bytes(s, "slab padding", POISON_INUSE, start, end);
667 return 0;
670 static int check_object(struct kmem_cache *s, struct slqb_page *page,
671 void *object, int active)
673 u8 *p = object;
674 u8 *endobject = object + s->objsize;
676 if (s->flags & SLAB_RED_ZONE) {
677 unsigned int red =
678 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
680 if (!check_bytes_and_report(s, page, object, "Redzone",
681 endobject, red, s->inuse - s->objsize))
682 return 0;
683 } else {
684 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
685 check_bytes_and_report(s, page, p, "Alignment padding",
686 endobject, POISON_INUSE, s->inuse - s->objsize);
690 if (s->flags & SLAB_POISON) {
691 if (!active && (s->flags & __OBJECT_POISON)) {
692 if (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1))
694 return 0;
696 if (!check_bytes_and_report(s, page, p, "Poison",
697 p + s->objsize - 1, POISON_END, 1))
698 return 0;
702 * check_pad_bytes cleans up on its own.
704 check_pad_bytes(s, page, p);
707 return 1;
710 static int check_slab(struct kmem_cache *s, struct slqb_page *page)
712 if (!(page->flags & PG_SLQB_BIT)) {
713 slab_err(s, page, "Not a valid slab page");
714 return 0;
716 if (page->inuse == 0) {
717 slab_err(s, page, "inuse before free / after alloc");
718 return 0;
720 if (page->inuse > s->objects) {
721 slab_err(s, page, "inuse %u > max %u",
722 page->inuse, s->objects);
723 return 0;
725 /* Slab_pad_check fixes things up after itself */
726 slab_pad_check(s, page);
727 return 1;
730 static void trace(struct kmem_cache *s, struct slqb_page *page,
731 void *object, int alloc)
733 if (s->flags & SLAB_TRACE) {
734 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
735 s->name,
736 alloc ? "alloc" : "free",
737 object, page->inuse,
738 page->freelist);
740 if (!alloc)
741 print_section("Object", (void *)object, s->objsize);
743 dump_stack();
747 static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
748 void *object)
750 if (!slab_debug(s))
751 return;
753 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
754 return;
756 init_object(s, object, 0);
757 init_tracking(s, object);
760 static int alloc_debug_processing(struct kmem_cache *s,
761 void *object, unsigned long addr)
763 struct slqb_page *page;
764 page = virt_to_head_slqb_page(object);
766 if (!check_slab(s, page))
767 goto bad;
769 if (!check_valid_pointer(s, page, object)) {
770 object_err(s, page, object, "Freelist Pointer check fails");
771 goto bad;
774 if (object && !check_object(s, page, object, 0))
775 goto bad;
777 /* Success. Perform special debug activities for allocs */
778 if (s->flags & SLAB_STORE_USER)
779 set_track(s, object, TRACK_ALLOC, addr);
780 trace(s, page, object, 1);
781 init_object(s, object, 1);
782 return 1;
784 bad:
785 return 0;
788 static int free_debug_processing(struct kmem_cache *s,
789 void *object, unsigned long addr)
791 struct slqb_page *page;
792 page = virt_to_head_slqb_page(object);
794 if (!check_slab(s, page))
795 goto fail;
797 if (!check_valid_pointer(s, page, object)) {
798 slab_err(s, page, "Invalid object pointer 0x%p", object);
799 goto fail;
802 if (!check_object(s, page, object, 1))
803 return 0;
805 /* Special debug activities for freeing objects */
806 if (s->flags & SLAB_STORE_USER)
807 set_track(s, object, TRACK_FREE, addr);
808 trace(s, page, object, 0);
809 init_object(s, object, 0);
810 return 1;
812 fail:
813 slab_fix(s, "Object at 0x%p not freed", object);
814 return 0;
817 static int __init setup_slqb_debug(char *str)
819 slqb_debug = DEBUG_DEFAULT_FLAGS;
820 if (*str++ != '=' || !*str) {
822 * No options specified. Switch on full debugging.
824 goto out;
827 if (*str == ',') {
829 * No options but restriction on slabs. This means full
830 * debugging for slabs matching a pattern.
832 goto check_slabs;
835 slqb_debug = 0;
836 if (*str == '-') {
838 * Switch off all debugging measures.
840 goto out;
844 * Determine which debug features should be switched on
846 for (; *str && *str != ','; str++) {
847 switch (tolower(*str)) {
848 case 'f':
849 slqb_debug |= SLAB_DEBUG_FREE;
850 break;
851 case 'z':
852 slqb_debug |= SLAB_RED_ZONE;
853 break;
854 case 'p':
855 slqb_debug |= SLAB_POISON;
856 break;
857 case 'u':
858 slqb_debug |= SLAB_STORE_USER;
859 break;
860 case 't':
861 slqb_debug |= SLAB_TRACE;
862 break;
863 default:
864 printk(KERN_ERR "slqb_debug option '%c' "
865 "unknown. skipped\n", *str);
869 check_slabs:
870 if (*str == ',')
871 slqb_debug_slabs = str + 1;
872 out:
873 return 1;
875 __setup("slqb_debug", setup_slqb_debug);
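For reference, the parser above accepts boot parameters such as the following (these are examples derived from reading the switch statement, not from separate documentation):

	slqb_debug			full debugging for all caches
	slqb_debug=-			switch all debugging off
	slqb_debug=zpu			red zoning, poisoning and user tracking for all caches
	slqb_debug=,dentry		full debugging only for caches whose name starts with "dentry"
	slqb_debug=t,kmalloc-64		tracing only for caches whose name starts with "kmalloc-64"

The slab-name part is a prefix match (see the strncmp() in kmem_cache_flags() below).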
877 static int __init setup_slqb_min_order(char *str)
879 get_option(&str, &slqb_min_order);
880 slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
882 return 1;
884 __setup("slqb_min_order=", setup_slqb_min_order);
886 static int __init setup_slqb_min_objects(char *str)
888 get_option(&str, &slqb_min_objects);
890 return 1;
893 __setup("slqb_min_objects=", setup_slqb_min_objects);
895 static unsigned long kmem_cache_flags(unsigned long objsize,
896 unsigned long flags, const char *name,
897 void (*ctor)(void *))
900 * Enable debugging if selected on the kernel commandline.
902 if (slqb_debug && (!slqb_debug_slabs ||
903 strncmp(slqb_debug_slabs, name,
904 strlen(slqb_debug_slabs)) == 0))
905 flags |= slqb_debug;
907 if (num_possible_nodes() > 1)
908 flags |= SLAB_NUMA;
910 return flags;
912 #else
913 static inline void setup_object_debug(struct kmem_cache *s,
914 struct slqb_page *page, void *object)
918 static inline int alloc_debug_processing(struct kmem_cache *s,
919 void *object, unsigned long addr)
921 return 0;
924 static inline int free_debug_processing(struct kmem_cache *s,
925 void *object, unsigned long addr)
927 return 0;
930 static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
932 return 1;
935 static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
936 void *object, int active)
938 return 1;
941 static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
945 static inline unsigned long kmem_cache_flags(unsigned long objsize,
946 unsigned long flags, const char *name, void (*ctor)(void *))
948 if (num_possible_nodes() > 1)
949 flags |= SLAB_NUMA;
950 return flags;
953 static const int slqb_debug;
954 #endif
957 * allocate a new slab (return its corresponding struct slqb_page)
959 static struct slqb_page *allocate_slab(struct kmem_cache *s,
960 gfp_t flags, int node)
962 struct slqb_page *page;
963 int pages = 1 << s->order;
965 flags |= s->allocflags;
967 page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
968 if (!page)
969 return NULL;
971 mod_zone_page_state(slqb_page_zone(page),
972 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
973 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
974 pages);
976 return page;
980 * Called once for each object on a new slab page
982 static void setup_object(struct kmem_cache *s,
983 struct slqb_page *page, void *object)
985 setup_object_debug(s, page, object);
986 if (unlikely(s->ctor))
987 s->ctor(object);
991 * Allocate a new slab, set up its object list.
993 static struct slqb_page *new_slab_page(struct kmem_cache *s,
994 gfp_t flags, int node, unsigned int colour)
996 struct slqb_page *page;
997 void *start;
998 void *last;
999 void *p;
1001 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1003 page = allocate_slab(s,
1004 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1005 if (!page)
1006 goto out;
1008 page->flags |= PG_SLQB_BIT;
1010 start = page_address(&page->page);
1012 if (unlikely(slab_poison(s)))
1013 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1015 start += colour;
1017 last = start;
1018 for_each_object(p, s, start) {
1019 setup_object(s, page, p);
1020 set_freepointer(s, last, p);
1021 last = p;
1023 set_freepointer(s, last, NULL);
1025 page->freelist = start;
1026 page->inuse = 0;
1027 out:
1028 return page;
1032 * Free a slab page back to the page allocator
1034 static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
1036 int pages = 1 << s->order;
1038 if (unlikely(slab_debug(s))) {
1039 void *p;
1041 slab_pad_check(s, page);
1042 for_each_free_object(p, s, page->freelist)
1043 check_object(s, page, p, 0);
1046 mod_zone_page_state(slqb_page_zone(page),
1047 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1048 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1049 -pages);
1051 __free_slqb_pages(page, s->order, pages);
1054 static void rcu_free_slab(struct rcu_head *h)
1056 struct slqb_page *page;
1058 page = container_of(h, struct slqb_page, rcu_head);
1059 __free_slab(page->list->cache, page);
1062 static void free_slab(struct kmem_cache *s, struct slqb_page *page)
1064 VM_BUG_ON(page->inuse);
1065 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
1066 call_rcu(&page->rcu_head, rcu_free_slab);
1067 else
1068 __free_slab(s, page);
1072 * Return an object to its slab.
1074 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1075 * list_lock in the case of per-node list.
1077 static int free_object_to_page(struct kmem_cache *s,
1078 struct kmem_cache_list *l, struct slqb_page *page,
1079 void *object)
1081 VM_BUG_ON(page->list != l);
1083 set_freepointer(s, object, page->freelist);
1084 page->freelist = object;
1085 page->inuse--;
1087 if (!page->inuse) {
1088 if (likely(s->objects > 1)) {
1089 l->nr_partial--;
1090 list_del(&page->lru);
1092 l->nr_slabs--;
1093 free_slab(s, page);
1094 slqb_stat_inc(l, FLUSH_SLAB_FREE);
1095 return 1;
1097 } else if (page->inuse + 1 == s->objects) {
1098 l->nr_partial++;
1099 list_add(&page->lru, &l->partial);
1100 slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
1101 return 0;
1103 return 0;
1106 #ifdef CONFIG_SMP
1107 static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
1108 void *object, struct kmem_cache_cpu *c);
1109 #endif
1112 * Flush the LIFO list of objects on a list. They are sent back to their pages
1113 * if those pages also belong to this list, or to our CPU's remote-free list
1114 * if they do not.
1116 * Doesn't flush the entire list. flush_free_list_all does.
1118 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1119 * list_lock in the case of per-node list.
1121 static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
1123 void **head;
1124 int nr;
1125 int locked = 0;
1127 nr = l->freelist.nr;
1128 if (unlikely(!nr))
1129 return;
1131 nr = min(slab_freebatch(s), nr);
1133 slqb_stat_inc(l, FLUSH_FREE_LIST);
1134 slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
1136 l->freelist.nr -= nr;
1137 head = l->freelist.head;
1139 do {
1140 struct slqb_page *page;
1141 void **object;
1143 object = head;
1144 VM_BUG_ON(!object);
1145 head = get_freepointer(s, object);
1146 page = virt_to_head_slqb_page(object);
1148 #ifdef CONFIG_SMP
1149 if (page->list != l) {
1150 struct kmem_cache_cpu *c;
1152 if (locked) {
1153 spin_unlock(&l->page_lock);
1154 locked = 0;
1157 c = get_cpu_slab(s, smp_processor_id());
1159 slab_free_to_remote(s, page, object, c);
1160 slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
1161 } else
1162 #endif
1164 if (!locked) {
1165 spin_lock(&l->page_lock);
1166 locked = 1;
1168 free_object_to_page(s, l, page, object);
1171 nr--;
1172 } while (nr);
1174 if (locked)
1175 spin_unlock(&l->page_lock);
1177 l->freelist.head = head;
1178 if (!l->freelist.nr)
1179 l->freelist.tail = NULL;
1182 static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
1184 while (l->freelist.nr)
1185 flush_free_list(s, l);
1188 #ifdef CONFIG_SMP
1190 * If enough objects have been remotely freed back to this list,
1191 * remote_free_check will be set, in which case we'll eventually come here
1192 * to take those objects off our remote_free list and onto our LIFO freelist.
1194 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1195 * list_lock in the case of per-node list.
1197 static void claim_remote_free_list(struct kmem_cache *s,
1198 struct kmem_cache_list *l)
1200 void **head, **tail;
1201 int nr;
1203 if (!l->remote_free.list.nr)
1204 return;
1206 spin_lock(&l->remote_free.lock);
1208 l->remote_free_check = 0;
1209 head = l->remote_free.list.head;
1210 l->remote_free.list.head = NULL;
1211 tail = l->remote_free.list.tail;
1212 l->remote_free.list.tail = NULL;
1213 nr = l->remote_free.list.nr;
1214 l->remote_free.list.nr = 0;
1216 spin_unlock(&l->remote_free.lock);
1218 VM_BUG_ON(!nr);
1220 if (!l->freelist.nr) {
1221 /* Get head hot for likely subsequent allocation or flush */
1222 prefetchw(head);
1223 l->freelist.head = head;
1224 } else
1225 set_freepointer(s, l->freelist.tail, head);
1226 l->freelist.tail = tail;
1228 l->freelist.nr += nr;
1230 slqb_stat_inc(l, CLAIM_REMOTE_LIST);
1231 slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
1233 #else
1234 static inline void claim_remote_free_list(struct kmem_cache *s,
1235 struct kmem_cache_list *l)
1238 #endif
1241 * Allocation fastpath. Get an object from the list's LIFO freelist, or
1242 * return NULL if it is empty.
1244 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1245 * list_lock in the case of per-node list.
1247 static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
1248 struct kmem_cache_list *l)
1250 void *object;
1252 object = l->freelist.head;
1253 if (likely(object)) {
1254 void *next = get_freepointer(s, object);
1256 VM_BUG_ON(!l->freelist.nr);
1257 l->freelist.nr--;
1258 l->freelist.head = next;
1260 return object;
1262 VM_BUG_ON(l->freelist.nr);
1264 #ifdef CONFIG_SMP
1265 if (unlikely(l->remote_free_check)) {
1266 claim_remote_free_list(s, l);
1268 if (l->freelist.nr > slab_hiwater(s))
1269 flush_free_list(s, l);
1271 /* repetition here helps gcc :( */
1272 object = l->freelist.head;
1273 if (likely(object)) {
1274 void *next = get_freepointer(s, object);
1276 VM_BUG_ON(!l->freelist.nr);
1277 l->freelist.nr--;
1278 l->freelist.head = next;
1280 return object;
1282 VM_BUG_ON(l->freelist.nr);
1284 #endif
1286 return NULL;
1290 * Slow(er) path. Get a page from this list's existing pages. Will be a
1291 * new empty page in the case that __slab_alloc_page has just been called
1292 * (empty pages otherwise never get queued up on the lists), or a partial page
1293 * already on the list.
1295 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1296 * list_lock in the case of per-node list.
1298 static noinline void *__cache_list_get_page(struct kmem_cache *s,
1299 struct kmem_cache_list *l)
1301 struct slqb_page *page;
1302 void *object;
1304 if (unlikely(!l->nr_partial))
1305 return NULL;
1307 page = list_first_entry(&l->partial, struct slqb_page, lru);
1308 VM_BUG_ON(page->inuse == s->objects);
1309 if (page->inuse + 1 == s->objects) {
1310 l->nr_partial--;
1311 list_del(&page->lru);
1314 VM_BUG_ON(!page->freelist);
1316 page->inuse++;
1318 object = page->freelist;
1319 page->freelist = get_freepointer(s, object);
1320 if (page->freelist)
1321 prefetchw(page->freelist);
1322 VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
1323 slqb_stat_inc(l, ALLOC_SLAB_FILL);
1325 return object;
1328 static void *cache_list_get_page(struct kmem_cache *s,
1329 struct kmem_cache_list *l)
1331 void *object;
1333 if (unlikely(!l->nr_partial))
1334 return NULL;
1336 spin_lock(&l->page_lock);
1337 object = __cache_list_get_page(s, l);
1338 spin_unlock(&l->page_lock);
1340 return object;
1344 * Allocation slowpath. Allocate a new slab page from the page allocator, and
1345 * put it on the list's partial list. Must be followed by an allocation so
1346 * that we don't have dangling empty pages on the partial list.
1348 * Returns 0 on allocation failure.
1350 * Must be called with interrupts disabled.
1352 static noinline void *__slab_alloc_page(struct kmem_cache *s,
1353 gfp_t gfpflags, int node)
1355 struct slqb_page *page;
1356 struct kmem_cache_list *l;
1357 struct kmem_cache_cpu *c;
1358 unsigned int colour;
1359 void *object;
1361 c = get_cpu_slab(s, smp_processor_id());
1362 colour = c->colour_next;
1363 c->colour_next += s->colour_off;
1364 if (c->colour_next >= s->colour_range)
1365 c->colour_next = 0;
1367 /* Caller handles __GFP_ZERO */
1368 gfpflags &= ~__GFP_ZERO;
1370 if (gfpflags & __GFP_WAIT)
1371 local_irq_enable();
1372 page = new_slab_page(s, gfpflags, node, colour);
1373 if (gfpflags & __GFP_WAIT)
1374 local_irq_disable();
1375 if (unlikely(!page))
1376 return page;
1378 if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
1379 struct kmem_cache_cpu *c;
1380 int cpu = smp_processor_id();
1382 c = get_cpu_slab(s, cpu);
1383 l = &c->list;
1384 page->list = l;
1386 spin_lock(&l->page_lock);
1387 l->nr_slabs++;
1388 l->nr_partial++;
1389 list_add(&page->lru, &l->partial);
1390 slqb_stat_inc(l, ALLOC);
1391 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1392 object = __cache_list_get_page(s, l);
1393 spin_unlock(&l->page_lock);
1394 } else {
1395 #ifdef CONFIG_NUMA
1396 struct kmem_cache_node *n;
1398 n = s->node_slab[slqb_page_to_nid(page)];
1399 l = &n->list;
1400 page->list = l;
1402 spin_lock(&n->list_lock);
1403 spin_lock(&l->page_lock);
1404 l->nr_slabs++;
1405 l->nr_partial++;
1406 list_add(&page->lru, &l->partial);
1407 slqb_stat_inc(l, ALLOC);
1408 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1409 object = __cache_list_get_page(s, l);
1410 spin_unlock(&l->page_lock);
1411 spin_unlock(&n->list_lock);
1412 #endif
1414 VM_BUG_ON(!object);
1415 return object;
1418 #ifdef CONFIG_NUMA
1419 static noinline int alternate_nid(struct kmem_cache *s,
1420 gfp_t gfpflags, int node)
1422 if (in_interrupt() || (gfpflags & __GFP_THISNODE))
1423 return node;
1424 if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
1425 return cpuset_mem_spread_node();
1426 else if (current->mempolicy)
1427 return slab_node(current->mempolicy);
1428 return node;
1432 * Allocate an object from a remote node. Return NULL if none could be found
1433 * (in which case, caller should allocate a new slab)
1435 * Must be called with interrupts disabled.
1437 static void *__remote_slab_alloc_node(struct kmem_cache *s,
1438 gfp_t gfpflags, int node)
1440 struct kmem_cache_node *n;
1441 struct kmem_cache_list *l;
1442 void *object;
1444 n = s->node_slab[node];
1445 if (unlikely(!n)) /* node has no memory */
1446 return NULL;
1447 l = &n->list;
1449 spin_lock(&n->list_lock);
1451 object = __cache_list_get_object(s, l);
1452 if (unlikely(!object)) {
1453 object = cache_list_get_page(s, l);
1454 if (unlikely(!object)) {
1455 spin_unlock(&n->list_lock);
1456 return __slab_alloc_page(s, gfpflags, node);
1459 if (likely(object))
1460 slqb_stat_inc(l, ALLOC);
1461 spin_unlock(&n->list_lock);
1462 return object;
1465 static noinline void *__remote_slab_alloc(struct kmem_cache *s,
1466 gfp_t gfpflags, int node)
1468 void *object;
1469 struct zonelist *zonelist;
1470 struct zoneref *z;
1471 struct zone *zone;
1472 enum zone_type high_zoneidx = gfp_zone(gfpflags);
1474 object = __remote_slab_alloc_node(s, gfpflags, node);
1475 if (likely(object || (gfpflags & __GFP_THISNODE)))
1476 return object;
1478 zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
1479 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1480 if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
1481 continue;
1483 node = zone_to_nid(zone);
1484 object = __remote_slab_alloc_node(s, gfpflags, node);
1485 if (likely(object))
1486 return object;
1488 return NULL;
1490 #endif
1493 * Main allocation path. Return an object, or NULL on allocation failure.
1495 * Must be called with interrupts disabled.
1497 static __always_inline void *__slab_alloc(struct kmem_cache *s,
1498 gfp_t gfpflags, int node)
1500 void *object;
1501 struct kmem_cache_cpu *c;
1502 struct kmem_cache_list *l;
1504 #ifdef CONFIG_NUMA
1505 if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
1506 try_remote:
1507 return __remote_slab_alloc(s, gfpflags, node);
1509 #endif
1511 c = get_cpu_slab(s, smp_processor_id());
1512 VM_BUG_ON(!c);
1513 l = &c->list;
1514 object = __cache_list_get_object(s, l);
1515 if (unlikely(!object)) {
1516 #ifdef CONFIG_NUMA
1517 int thisnode = numa_node_id();
1520 * If the local node is memoryless, try remote alloc before
1521 * trying the page allocator. Otherwise, what happens is
1522 * objects are always freed to remote lists but the allocation
1523 * side always allocates a new page with only one object
1524 * used in each page
1526 if (unlikely(!node_state(thisnode, N_HIGH_MEMORY)))
1527 object = __remote_slab_alloc(s, gfpflags, thisnode);
1528 #endif
1530 if (!object) {
1531 object = cache_list_get_page(s, l);
1532 if (unlikely(!object)) {
1533 object = __slab_alloc_page(s, gfpflags, node);
1534 #ifdef CONFIG_NUMA
1535 if (unlikely(!object)) {
1536 node = numa_node_id();
1537 goto try_remote;
1539 #endif
1540 return object;
1544 if (likely(object))
1545 slqb_stat_inc(l, ALLOC);
1546 return object;
1550 * Perform some interrupts-on processing around the main allocation path
1551 * (debug checking and memset()ing).
1553 static __always_inline void *slab_alloc(struct kmem_cache *s,
1554 gfp_t gfpflags, int node, unsigned long addr)
1556 void *object;
1557 unsigned long flags;
1559 gfpflags &= gfp_allowed_mask;
1561 lockdep_trace_alloc(gfpflags);
1562 might_sleep_if(gfpflags & __GFP_WAIT);
1564 if (should_failslab(s->objsize, gfpflags))
1565 return NULL;
1567 again:
1568 local_irq_save(flags);
1569 object = __slab_alloc(s, gfpflags, node);
1570 local_irq_restore(flags);
1572 if (unlikely(slab_debug(s)) && likely(object)) {
1573 if (unlikely(!alloc_debug_processing(s, object, addr)))
1574 goto again;
1577 if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
1578 memset(object, 0, s->objsize);
1580 return object;
1583 static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
1584 gfp_t gfpflags, unsigned long caller)
1586 int node = -1;
1588 #ifdef CONFIG_NUMA
1589 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
1590 node = alternate_nid(s, gfpflags, node);
1591 #endif
1592 return slab_alloc(s, gfpflags, node, caller);
1595 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1597 return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
1599 EXPORT_SYMBOL(kmem_cache_alloc);
1601 #ifdef CONFIG_NUMA
1602 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1604 return slab_alloc(s, gfpflags, node, _RET_IP_);
1606 EXPORT_SYMBOL(kmem_cache_alloc_node);
1607 #endif
1609 #ifdef CONFIG_SMP
1611 * Flush this CPU's remote free list of objects back to the list from where
1612 * they originate. They end up on that list's remotely freed list, and
1613 * eventually we set its remote_free_check if there are enough objects on it.
1615 * This seems convoluted, but it keeps us from stomping on the target CPU's
1616 * fastpath cachelines.
1618 * Must be called with interrupts disabled.
1620 static void flush_remote_free_cache(struct kmem_cache *s,
1621 struct kmem_cache_cpu *c)
1623 struct kmlist *src;
1624 struct kmem_cache_list *dst;
1625 unsigned int nr;
1626 int set;
1628 src = &c->rlist;
1629 nr = src->nr;
1630 if (unlikely(!nr))
1631 return;
1633 #ifdef CONFIG_SLQB_STATS
1635 struct kmem_cache_list *l = &c->list;
1637 slqb_stat_inc(l, FLUSH_RFREE_LIST);
1638 slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
1640 #endif
1642 dst = c->remote_cache_list;
1645 * Less common case, dst is filling up so free synchronously.
1646 * No point in having the remote CPU free these as it will just
1647 * free them back to the page list anyway.
1649 if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
1650 void **head;
1652 head = src->head;
1653 spin_lock(&dst->page_lock);
1654 do {
1655 struct slqb_page *page;
1656 void **object;
1658 object = head;
1659 VM_BUG_ON(!object);
1660 head = get_freepointer(s, object);
1661 page = virt_to_head_slqb_page(object);
1663 free_object_to_page(s, dst, page, object);
1664 nr--;
1665 } while (nr);
1666 spin_unlock(&dst->page_lock);
1668 src->head = NULL;
1669 src->tail = NULL;
1670 src->nr = 0;
1672 return;
1675 spin_lock(&dst->remote_free.lock);
1677 if (!dst->remote_free.list.head)
1678 dst->remote_free.list.head = src->head;
1679 else
1680 set_freepointer(s, dst->remote_free.list.tail, src->head);
1681 dst->remote_free.list.tail = src->tail;
1683 src->head = NULL;
1684 src->tail = NULL;
1685 src->nr = 0;
1687 if (dst->remote_free.list.nr < slab_freebatch(s))
1688 set = 1;
1689 else
1690 set = 0;
1692 dst->remote_free.list.nr += nr;
1694 if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
1695 dst->remote_free_check = 1;
1697 spin_unlock(&dst->remote_free.lock);
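Worth noting: the set/remote_free_check dance above raises the flag only on the transfer that carries the destination's remote list across the slab_freebatch() watermark. With hypothetical numbers, freebatch = 16 and 10 objects already queued, flushing 8 more sets the flag (10 < 16 before, 18 >= 16 after); a later flush of another 8 (18 -> 26) leaves it alone, so the owner CPU is poked once per batch rather than on every flush.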
1701 * Free an object to this CPU's remote free list.
1703 * Must be called with interrupts disabled.
1705 static noinline void slab_free_to_remote(struct kmem_cache *s,
1706 struct slqb_page *page, void *object,
1707 struct kmem_cache_cpu *c)
1709 struct kmlist *r;
1712 * Our remote free list corresponds to a different list. Must
1713 * flush it and switch.
1715 if (page->list != c->remote_cache_list) {
1716 flush_remote_free_cache(s, c);
1717 c->remote_cache_list = page->list;
1720 r = &c->rlist;
1721 if (!r->head)
1722 r->head = object;
1723 else
1724 set_freepointer(s, r->tail, object);
1725 set_freepointer(s, object, NULL);
1726 r->tail = object;
1727 r->nr++;
1729 if (unlikely(r->nr >= slab_freebatch(s)))
1730 flush_remote_free_cache(s, c);
1732 #endif
1735 * Main freeing path.
1737 * Must be called with interrupts disabled.
1739 static __always_inline void __slab_free(struct kmem_cache *s,
1740 struct slqb_page *page, void *object)
1742 struct kmem_cache_cpu *c;
1743 struct kmem_cache_list *l;
1744 int thiscpu = smp_processor_id();
1746 c = get_cpu_slab(s, thiscpu);
1747 l = &c->list;
1749 slqb_stat_inc(l, FREE);
1751 if (!NUMA_BUILD || !slab_numa(s) ||
1752 likely(slqb_page_to_nid(page) == numa_node_id())) {
1754 * Freeing fastpath. Collects all local-node objects, not
1755 * just those allocated from our per-CPU list. This allows
1756 * fast transfer of objects from one CPU to another within
1757 * a given node.
1759 set_freepointer(s, object, l->freelist.head);
1760 l->freelist.head = object;
1761 if (!l->freelist.nr)
1762 l->freelist.tail = object;
1763 l->freelist.nr++;
1765 if (unlikely(l->freelist.nr > slab_hiwater(s)))
1766 flush_free_list(s, l);
1768 } else {
1769 #ifdef CONFIG_SMP
1771 * Freeing an object that was allocated on a remote node.
1773 slab_free_to_remote(s, page, object, c);
1774 slqb_stat_inc(l, FREE_REMOTE);
1775 #endif
1780 * Perform some interrupts-on processing around the main freeing path
1781 * (debug checking).
1783 static __always_inline void slab_free(struct kmem_cache *s,
1784 struct slqb_page *page, void *object)
1786 unsigned long flags;
1788 prefetchw(object);
1790 debug_check_no_locks_freed(object, s->objsize);
1791 if (likely(object) && unlikely(slab_debug(s))) {
1792 if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
1793 return;
1796 local_irq_save(flags);
1797 __slab_free(s, page, object);
1798 local_irq_restore(flags);
1801 void kmem_cache_free(struct kmem_cache *s, void *object)
1803 struct slqb_page *page = NULL;
1805 if (slab_numa(s))
1806 page = virt_to_head_slqb_page(object);
1807 slab_free(s, page, object);
1809 EXPORT_SYMBOL(kmem_cache_free);
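For completeness, the two exported entry points above are used like any other slab allocator's. A minimal hypothetical caller, assuming a cache created with the usual kmem_cache_create() interface (struct foo, foo_cache and the functions below are made up for illustration):

	#include <linux/init.h>
	#include <linux/slab.h>

	struct foo {
		int a;
		int b;
	};

	static struct kmem_cache *foo_cache;

	static int __init foo_init(void)
	{
		foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
					      SLAB_HWCACHE_ALIGN, NULL);
		if (!foo_cache)
			return -ENOMEM;
		return 0;
	}

	static void foo_use(void)
	{
		struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

		if (!f)
			return;
		f->a = 1;
		kmem_cache_free(foo_cache, f);
	}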
1812 * Calculate the order of allocation given a slab object size.
1814 * Order 0 allocations are preferred since order 0 does not cause fragmentation
1815 * in the page allocator, and they have fastpaths in the page allocator. But
1816 * also minimise external fragmentation with large objects.
1818 static int slab_order(int size, int max_order, int frac)
1820 int order;
1822 if (fls(size - 1) <= PAGE_SHIFT)
1823 order = 0;
1824 else
1825 order = fls(size - 1) - PAGE_SHIFT;
1826 if (order < slqb_min_order)
1827 order = slqb_min_order;
1829 while (order <= max_order) {
1830 unsigned long slab_size = PAGE_SIZE << order;
1831 unsigned long objects;
1832 unsigned long waste;
1834 objects = slab_size / size;
1835 if (!objects)
1836 goto next;
1838 if (order < MAX_ORDER && objects < slqb_min_objects) {
1840 * if we don't have enough objects for min_objects,
1841 * then try the next size up. Unless we have reached
1842 * our maximum possible page size.
1844 goto next;
1847 waste = slab_size - (objects * size);
1849 if (waste * frac <= slab_size)
1850 break;
1852 next:
1853 order++;
1856 return order;
1859 static int calculate_order(int size)
1861 int order;
1864 * Attempt to find the best configuration for a slab. This
1865 * works by first attempting to generate a layout with
1866 * the best configuration and backing off gradually.
1868 order = slab_order(size, 1, 4);
1869 if (order <= 1)
1870 return order;
1873 * This size cannot fit in order-1. Allow bigger orders, but
1874 * forget about trying to save space.
1876 order = slab_order(size, MAX_ORDER - 1, 0);
1877 if (order < MAX_ORDER)
1878 return order;
1880 return -ENOSYS;
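To make the two passes concrete (assuming PAGE_SIZE = 4096, slqb_min_order = 0 and slqb_min_objects = 1): a 1100-byte object fits 3 per page with 796 bytes of waste, and 796 * 4 <= 4096, so the first pass accepts order 0; a 1500-byte object fits only 2 per page with 1096 bytes of waste, 1096 * 4 > 4096, so the first pass moves on and settles on order 1 (5 objects, 692 bytes of waste). Only when order 1 cannot satisfy the quarter-waste test does calculate_order() fall back to the second, waste-insensitive pass.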
1884 * Figure out what the alignment of the objects will be.
1886 static unsigned long calculate_alignment(unsigned long flags,
1887 unsigned long align, unsigned long size)
1890 * If the user wants hardware cache aligned objects then follow that
1891 * suggestion if the object is sufficiently large.
1893 * The hardware cache alignment cannot override the specified
1894 * alignment though. If that is greater, use it.
1896 if (flags & SLAB_HWCACHE_ALIGN) {
1897 unsigned long ralign = cache_line_size();
1899 while (size <= ralign / 2)
1900 ralign /= 2;
1901 align = max(align, ralign);
1904 if (align < ARCH_SLAB_MINALIGN)
1905 align = ARCH_SLAB_MINALIGN;
1907 return ALIGN(align, sizeof(void *));
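A quick worked example of the loop above, assuming cache_line_size() == 64 and SLAB_HWCACHE_ALIGN: a 24-byte object halves ralign from 64 to 32 (24 <= 32) and then stops (24 > 16), so it ends up 32-byte aligned; a 100-byte object never enters the loop and keeps the full 64-byte alignment. In both cases the result is still raised to at least ARCH_SLAB_MINALIGN and rounded to a multiple of sizeof(void *).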
1910 static void init_kmem_cache_list(struct kmem_cache *s,
1911 struct kmem_cache_list *l)
1913 l->cache = s;
1914 l->freelist.nr = 0;
1915 l->freelist.head = NULL;
1916 l->freelist.tail = NULL;
1917 l->nr_partial = 0;
1918 l->nr_slabs = 0;
1919 INIT_LIST_HEAD(&l->partial);
1920 spin_lock_init(&l->page_lock);
1922 #ifdef CONFIG_SMP
1923 l->remote_free_check = 0;
1924 spin_lock_init(&l->remote_free.lock);
1925 l->remote_free.list.nr = 0;
1926 l->remote_free.list.head = NULL;
1927 l->remote_free.list.tail = NULL;
1928 #endif
1930 #ifdef CONFIG_SLQB_STATS
1931 memset(l->stats, 0, sizeof(l->stats));
1932 #endif
1935 static void init_kmem_cache_cpu(struct kmem_cache *s,
1936 struct kmem_cache_cpu *c)
1938 init_kmem_cache_list(s, &c->list);
1940 c->colour_next = 0;
1941 #ifdef CONFIG_SMP
1942 c->rlist.nr = 0;
1943 c->rlist.head = NULL;
1944 c->rlist.tail = NULL;
1945 c->remote_cache_list = NULL;
1946 #endif
1949 #ifdef CONFIG_NUMA
1950 static void init_kmem_cache_node(struct kmem_cache *s,
1951 struct kmem_cache_node *n)
1953 spin_lock_init(&n->list_lock);
1954 init_kmem_cache_list(s, &n->list);
1956 #endif
1958 /* Initial slabs. */
1959 #ifdef CONFIG_SMP
1960 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
1961 #endif
1962 #ifdef CONFIG_NUMA
1963 /* XXX: really need a DEFINE_PER_NODE for per-node data because a static
1964 * array is wasteful */
1965 static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES];
1966 #endif
1968 #ifdef CONFIG_SMP
1969 static struct kmem_cache kmem_cpu_cache;
1970 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
1971 #ifdef CONFIG_NUMA
1972 static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */
1973 #endif
1974 #endif
1976 #ifdef CONFIG_NUMA
1977 static struct kmem_cache kmem_node_cache;
1978 #ifdef CONFIG_SMP
1979 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
1980 #endif
1981 static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */
1982 #endif
1984 #ifdef CONFIG_SMP
1985 static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1986 int cpu)
1988 struct kmem_cache_cpu *c;
1989 int node;
1991 node = cpu_to_node(cpu);
1993 c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
1994 if (!c)
1995 return NULL;
1997 init_kmem_cache_cpu(s, c);
1998 return c;
2001 static void free_kmem_cache_cpus(struct kmem_cache *s)
2003 int cpu;
2005 for_each_online_cpu(cpu) {
2006 struct kmem_cache_cpu *c;
2008 c = s->cpu_slab[cpu];
2009 if (c) {
2010 kmem_cache_free(&kmem_cpu_cache, c);
2011 s->cpu_slab[cpu] = NULL;
2016 static int alloc_kmem_cache_cpus(struct kmem_cache *s)
2018 int cpu;
2020 for_each_online_cpu(cpu) {
2021 struct kmem_cache_cpu *c;
2023 c = s->cpu_slab[cpu];
2024 if (c)
2025 continue;
2027 c = alloc_kmem_cache_cpu(s, cpu);
2028 if (!c) {
2029 free_kmem_cache_cpus(s);
2030 return 0;
2032 s->cpu_slab[cpu] = c;
2034 return 1;
2037 #else
2038 static inline void free_kmem_cache_cpus(struct kmem_cache *s)
2042 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2044 init_kmem_cache_cpu(s, &s->cpu_slab);
2045 return 1;
2047 #endif
2049 #ifdef CONFIG_NUMA
2050 static void free_kmem_cache_nodes(struct kmem_cache *s)
2052 int node;
2054 for_each_node_state(node, N_NORMAL_MEMORY) {
2055 struct kmem_cache_node *n;
2057 n = s->node_slab[node];
2058 if (n) {
2059 kmem_cache_free(&kmem_node_cache, n);
2060 s->node_slab[node] = NULL;
2065 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2067 int node;
2069 for_each_node_state(node, N_NORMAL_MEMORY) {
2070 struct kmem_cache_node *n;
2072 n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
2073 if (!n) {
2074 free_kmem_cache_nodes(s);
2075 return 0;
2077 init_kmem_cache_node(s, n);
2078 s->node_slab[node] = n;
2080 return 1;
2082 #else
2083 static void free_kmem_cache_nodes(struct kmem_cache *s)
2087 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2089 return 1;
2091 #endif
2094 * calculate_sizes() determines the order and the distribution of data within
2095 * a slab object.
2097 static int calculate_sizes(struct kmem_cache *s)
2099 unsigned long flags = s->flags;
2100 unsigned long size = s->objsize;
2101 unsigned long align = s->align;
2104 * Determine if we can poison the object itself. If the user of
2105 * the slab may touch the object after free or before allocation
2106 * then we should never poison the object itself.
2108 if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
2109 s->flags |= __OBJECT_POISON;
2110 else
2111 s->flags &= ~__OBJECT_POISON;
2114 * Round up object size to the next word boundary. We can only
2115 * place the free pointer at word boundaries and this determines
2116 * the possible location of the free pointer.
2118 size = ALIGN(size, sizeof(void *));
2120 #ifdef CONFIG_SLQB_DEBUG
2122 * If we are Redzoning then check if there is some space between the
2123 * end of the object and the free pointer. If not then add an
2124 * additional word to have some bytes to store Redzone information.
2126 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2127 size += sizeof(void *);
2128 #endif
2131 * With that we have determined the number of bytes in actual use
2132 * by the object. This is the potential offset to the free pointer.
2134 s->inuse = size;
2136 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
2138 * Relocate free pointer after the object if it is not
2139 * permitted to overwrite the first word of the object on
2140 * kmem_cache_free.
2142 * This is the case if we do RCU, have a constructor or
2143 * destructor or are poisoning the objects.
2145 s->offset = size;
2146 size += sizeof(void *);
2149 #ifdef CONFIG_SLQB_DEBUG
2150 if (flags & SLAB_STORE_USER) {
2152 * Need to store information about allocs and frees after
2153 * the object.
2155 size += 2 * sizeof(struct track);
2158 if (flags & SLAB_RED_ZONE) {
2160 * Add some empty padding so that we can catch
2161 * overwrites from earlier objects rather than let
2162 * tracking information or the free pointer be
2163 * corrupted if a user writes before the start
2164 * of the object.
2166 size += sizeof(void *);
2168 #endif
2171 * Determine the alignment based on various parameters that the
2172 * user specified and the dynamic determination of cache line size
2173 * on bootup.
2175 align = calculate_alignment(flags, align, s->objsize);
2178 * SLQB stores one object immediately after another beginning from
2179 * offset 0. In order to align the objects we have to simply size
2180 * each object to conform to the alignment.
2182 size = ALIGN(size, align);
2183 s->size = size;
2184 s->order = calculate_order(size);
2186 if (s->order < 0)
2187 return 0;
2189 s->allocflags = 0;
2190 if (s->order)
2191 s->allocflags |= __GFP_COMP;
2193 if (s->flags & SLAB_CACHE_DMA)
2194 s->allocflags |= SLQB_DMA;
2196 if (s->flags & SLAB_RECLAIM_ACCOUNT)
2197 s->allocflags |= __GFP_RECLAIMABLE;
2200 * Determine the number of objects per slab
2202 s->objects = (PAGE_SIZE << s->order) / size;
2204 s->freebatch = max(4UL*PAGE_SIZE / size,
2205 min(256UL, 64*PAGE_SIZE / size));
2206 if (!s->freebatch)
2207 s->freebatch = 1;
2208 s->hiwater = s->freebatch << 2;
2210 return !!s->objects;
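Plugging numbers into the batching heuristics above (assuming PAGE_SIZE = 4096): a 256-byte object gets freebatch = max(4 * 4096 / 256, min(256, 64 * 4096 / 256)) = max(64, 256) = 256 and hiwater = 1024 objects, while a page-sized 4096-byte object gets freebatch = max(4, min(256, 64)) = 64 and hiwater = 256.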
2214 #ifdef CONFIG_SMP
2216 * Per-cpu allocator can't be used because it always uses the slab allocator,
2217 * and it can't do per-node allocations.
2219 static void *kmem_cache_dyn_array_alloc(int ids)
2221 size_t size = sizeof(void *) * ids;
2223 BUG_ON(!size);
2225 if (unlikely(!slab_is_available())) {
2226 static void *nextmem;
2227 static size_t nextleft;
2228 void *ret;
2231 * Special case for setting up initial caches. These will
2232 * never get freed by definition so we can do it rather
2233 * simply.
2235 if (size > nextleft) {
2236 nextmem = alloc_pages_exact(size, GFP_KERNEL);
2237 if (!nextmem)
2238 return NULL;
2239 nextleft = roundup(size, PAGE_SIZE);
2242 ret = nextmem;
2243 nextleft -= size;
2244 nextmem += size;
2245 memset(ret, 0, size);
2246 return ret;
2247 } else {
2248 return kzalloc(size, GFP_KERNEL);
2252 static void kmem_cache_dyn_array_free(void *array)
2254 if (unlikely(!slab_is_available()))
2255 return; /* error case without crashing here (will panic soon) */
2256 kfree(array);
2258 #endif
2261 * Except in early boot, this should be called with slqb_lock held for write
2262 * to lock out hotplug, and protect list modifications.
2264 static int kmem_cache_open(struct kmem_cache *s,
2265 const char *name, size_t size, size_t align,
2266 unsigned long flags, void (*ctor)(void *), int alloc)
2268 unsigned int left_over;
2270 memset(s, 0, sizeof(struct kmem_cache));
2271 s->name = name;
2272 s->ctor = ctor;
2273 s->objsize = size;
2274 s->align = align;
2275 s->flags = kmem_cache_flags(size, flags, name, ctor);
2277 if (!calculate_sizes(s))
2278 goto error;
2280 if (!slab_debug(s)) {
2281 left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
2282 s->colour_off = max(cache_line_size(), s->align);
2283 s->colour_range = left_over;
2284 } else {
2285 s->colour_off = 0;
2286 s->colour_range = 0;
2289 #ifdef CONFIG_SMP
2290 s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
2291 if (!s->cpu_slab)
2292 goto error;
2293 # ifdef CONFIG_NUMA
2294 s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
2295 if (!s->node_slab)
2296 goto error_cpu_array;
2297 # endif
2298 #endif
2300 if (likely(alloc)) {
2301 if (!alloc_kmem_cache_nodes(s))
2302 goto error_node_array;
2304 if (!alloc_kmem_cache_cpus(s))
2305 goto error_nodes;
2308 sysfs_slab_add(s);
2309 list_add(&s->list, &slab_caches);
2311 return 1;
2313 error_nodes:
2314 free_kmem_cache_nodes(s);
2315 error_node_array:
2316 #if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
2317 kmem_cache_dyn_array_free(s->node_slab);
2318 error_cpu_array:
2319 #endif
2320 #ifdef CONFIG_SMP
2321 kmem_cache_dyn_array_free(s->cpu_slab);
2322 #endif
2323 error:
2324 if (flags & SLAB_PANIC)
2325 panic("%s: failed to create slab `%s'\n", __func__, name);
2326 return 0;
2330 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2331 * @s: the cache we're checking against
2332 * @ptr: pointer to validate
2334 * This verifies that the untrusted pointer looks sane;
2335 * it is _not_ a guarantee that the pointer is actually
2336 * part of the slab cache in question, but it at least
2337 * validates that the pointer can be dereferenced and
2338 * looks half-way sane.
2340 * Currently only used for dentry validation.
2342 int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
2344 unsigned long addr = (unsigned long)ptr;
2345 struct slqb_page *page;
2347 if (unlikely(addr < PAGE_OFFSET))
2348 goto out;
2349 if (unlikely(addr > (unsigned long)high_memory - s->size))
2350 goto out;
2351 if (unlikely(!IS_ALIGNED(addr, s->align)))
2352 goto out;
2353 if (unlikely(!kern_addr_valid(addr)))
2354 goto out;
2355 if (unlikely(!kern_addr_valid(addr + s->size - 1)))
2356 goto out;
2357 if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
2358 goto out;
2359 page = virt_to_head_slqb_page(ptr);
2360 if (unlikely(!(page->flags & PG_SLQB_BIT)))
2361 goto out;
2362 if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
2363 goto out;
2364 return 1;
2365 out:
2366 return 0;
2368 EXPORT_SYMBOL(kmem_ptr_validate);
2371 * Determine the size of a slab object
2373 unsigned int kmem_cache_size(struct kmem_cache *s)
2375 return s->objsize;
2377 EXPORT_SYMBOL(kmem_cache_size);
2379 const char *kmem_cache_name(struct kmem_cache *s)
2381 return s->name;
2383 EXPORT_SYMBOL(kmem_cache_name);
2386 * Release all resources used by a slab cache. Callers must ensure no further
2387 * concurrency on the cache, so we can touch remote kmem_cache_cpu structures.
2389 void kmem_cache_destroy(struct kmem_cache *s)
2391 #ifdef CONFIG_NUMA
2392 int node;
2393 #endif
2394 int cpu;
2396 down_write(&slqb_lock);
2397 list_del(&s->list);
2399 local_irq_disable();
2400 #ifdef CONFIG_SMP
2401 for_each_online_cpu(cpu) {
2402 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2403 struct kmem_cache_list *l = &c->list;
2405 flush_free_list_all(s, l);
2406 flush_remote_free_cache(s, c);
2408 #endif
2410 for_each_online_cpu(cpu) {
2411 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2412 struct kmem_cache_list *l = &c->list;
2414 claim_remote_free_list(s, l);
2415 flush_free_list_all(s, l);
2417 WARN_ON(l->freelist.nr);
2418 WARN_ON(l->nr_slabs);
2419 WARN_ON(l->nr_partial);
2422 free_kmem_cache_cpus(s);
2424 #ifdef CONFIG_NUMA
2425 for_each_node_state(node, N_NORMAL_MEMORY) {
2426 struct kmem_cache_node *n;
2427 struct kmem_cache_list *l;
2429 n = s->node_slab[node];
2430 if (!n)
2431 continue;
2432 l = &n->list;
2434 claim_remote_free_list(s, l);
2435 flush_free_list_all(s, l);
2437 WARN_ON(l->freelist.nr);
2438 WARN_ON(l->nr_slabs);
2439 WARN_ON(l->nr_partial);
2442 free_kmem_cache_nodes(s);
2443 #endif
2444 local_irq_enable();
2446 sysfs_slab_remove(s);
2447 up_write(&slqb_lock);
2449 EXPORT_SYMBOL(kmem_cache_destroy);
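/*
 * Editor's note: as the WARN_ONs above indicate, kmem_cache_destroy()
 * expects every object to have been returned to the cache already;
 * destroying a cache that still has live objects is a caller bug.
 */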
2451 /********************************************************************
2452 * Kmalloc subsystem
2453 *******************************************************************/
2455 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2456 EXPORT_SYMBOL(kmalloc_caches);
2458 #ifdef CONFIG_ZONE_DMA
2459 struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2460 EXPORT_SYMBOL(kmalloc_caches_dma);
2461 #endif
2463 #ifndef ARCH_KMALLOC_FLAGS
2464 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
2465 #endif
2467 static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
2468 const char *name, int size, gfp_t gfp_flags)
2470 unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
2472 if (gfp_flags & SLQB_DMA)
2473 flags |= SLAB_CACHE_DMA;
2475 kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
2477 return s;
2481 * Conversion table mapping small allocation sizes (divided by 8) to the
2482 * index in the kmalloc array. This is necessary for sizes up to 192 since
2483 * there are non-power-of-two cache sizes in that range. The index for
2484 * larger sizes can be determined using fls.
2486 static s8 size_index[24] __cacheline_aligned = {
2487 3, /* 8 */
2488 4, /* 16 */
2489 5, /* 24 */
2490 5, /* 32 */
2491 6, /* 40 */
2492 6, /* 48 */
2493 6, /* 56 */
2494 6, /* 64 */
2495 #if L1_CACHE_BYTES < 64
2496 1, /* 72 */
2497 1, /* 80 */
2498 1, /* 88 */
2499 1, /* 96 */
2500 #else
2505 #endif
2506 7, /* 104 */
2507 7, /* 112 */
2508 7, /* 120 */
2509 7, /* 128 */
2510 #if L1_CACHE_BYTES < 128
2511 2, /* 136 */
2512 2, /* 144 */
2513 2, /* 152 */
2514 2, /* 160 */
2515 2, /* 168 */
2516 2, /* 176 */
2517 2, /* 184 */
2518 2 /* 192 */
2519 #else
2528 #endif
2531 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2533 int index;
2535 if (unlikely(size <= KMALLOC_MIN_SIZE)) {
2536 if (unlikely(!size))
2537 return ZERO_SIZE_PTR;
2539 index = KMALLOC_SHIFT_LOW;
2540 goto got_index;
2543 #if L1_CACHE_BYTES >= 128
2544 if (size <= 128) {
2545 #else
2546 if (size <= 192) {
2547 #endif
2548 index = size_index[(size - 1) / 8];
2549 } else {
2550 if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
2551 return NULL;
2553 index = fls(size - 1);
2556 got_index:
2557 if (unlikely((flags & SLQB_DMA)))
2558 return &kmalloc_caches_dma[index];
2559 else
2560 return &kmalloc_caches[index];
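/*
 * Worked example (editor's illustration), assuming KMALLOC_MIN_SIZE == 8:
 *   kmalloc(100)  -> size_index[(100 - 1) / 8] == size_index[12] == 7,
 *                    i.e. the 128-byte cache;
 *   kmalloc(5000) -> fls(4999) == 13, i.e. the 8192-byte cache.
 * Requests above 1 << KMALLOC_SHIFT_SLQB_HIGH make get_slab() return NULL.
 */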
2563 void *__kmalloc(size_t size, gfp_t flags)
2565 struct kmem_cache *s;
2567 s = get_slab(size, flags);
2568 if (unlikely(ZERO_OR_NULL_PTR(s)))
2569 return s;
2571 return __kmem_cache_alloc(s, flags, _RET_IP_);
2573 EXPORT_SYMBOL(__kmalloc);
2575 #ifdef CONFIG_NUMA
2576 void *__kmalloc_node(size_t size, gfp_t flags, int node)
2578 struct kmem_cache *s;
2580 s = get_slab(size, flags);
2581 if (unlikely(ZERO_OR_NULL_PTR(s)))
2582 return s;
2584 return kmem_cache_alloc_node(s, flags, node);
2586 EXPORT_SYMBOL(__kmalloc_node);
2587 #endif
2589 size_t ksize(const void *object)
2591 struct slqb_page *page;
2592 struct kmem_cache *s;
2594 BUG_ON(!object);
2595 if (unlikely(object == ZERO_SIZE_PTR))
2596 return 0;
2598 page = virt_to_head_slqb_page(object);
2599 BUG_ON(!(page->flags & PG_SLQB_BIT));
2601 s = page->list->cache;
2604 * Debugging requires use of the padding between object
2605 * and whatever may come after it.
2607 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2608 return s->objsize;
2611 * If we have the need to store the freelist pointer
2612 * back there or track user information then we can
2613 * only use the space before that information.
2615 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2616 return s->inuse;
2619 * Else we can use all the padding etc for the allocation
2621 return s->size;
2623 EXPORT_SYMBOL(ksize);
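/*
 * Editor's note: ksize() reports the usable size of an allocation, which
 * may exceed what was asked for.  For example, kmalloc(100) is satisfied
 * from the 128-byte cache, so ksize() on that object typically returns
 * 128 rather than 100.
 */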
2625 void kfree(const void *object)
2627 struct kmem_cache *s;
2628 struct slqb_page *page;
2630 if (unlikely(ZERO_OR_NULL_PTR(object)))
2631 return;
2633 page = virt_to_head_slqb_page(object);
2634 s = page->list->cache;
2636 slab_free(s, page, (void *)object);
2638 EXPORT_SYMBOL(kfree);
2640 static void kmem_cache_trim_percpu(void *arg)
2642 int cpu = smp_processor_id();
2643 struct kmem_cache *s = arg;
2644 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2645 struct kmem_cache_list *l = &c->list;
2647 claim_remote_free_list(s, l);
2648 flush_free_list(s, l);
2649 #ifdef CONFIG_SMP
2650 flush_remote_free_cache(s, c);
2651 #endif
2654 int kmem_cache_shrink(struct kmem_cache *s)
2656 #ifdef CONFIG_NUMA
2657 int node;
2658 #endif
2660 on_each_cpu(kmem_cache_trim_percpu, s, 1);
2662 #ifdef CONFIG_NUMA
2663 for_each_node_state(node, N_NORMAL_MEMORY) {
2664 struct kmem_cache_node *n;
2665 struct kmem_cache_list *l;
2667 n = s->node_slab[node];
2668 if (!n)
2669 continue;
2670 l = &n->list;
2672 spin_lock_irq(&n->list_lock);
2673 claim_remote_free_list(s, l);
2674 flush_free_list(s, l);
2675 spin_unlock_irq(&n->list_lock);
2677 #endif
2679 return 0;
2681 EXPORT_SYMBOL(kmem_cache_shrink);
2683 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2684 static void kmem_cache_reap_percpu(void *arg)
2686 int cpu = smp_processor_id();
2687 struct kmem_cache *s;
2688 long phase = (long)arg;
2690 list_for_each_entry(s, &slab_caches, list) {
2691 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2692 struct kmem_cache_list *l = &c->list;
2694 if (phase == 0) {
2695 flush_free_list_all(s, l);
2696 flush_remote_free_cache(s, c);
2699 if (phase == 1) {
2700 claim_remote_free_list(s, l);
2701 flush_free_list_all(s, l);
2706 static void kmem_cache_reap(void)
2708 struct kmem_cache *s;
2709 int node;
2711 down_read(&slqb_lock);
2712 on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
2713 on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
2715 list_for_each_entry(s, &slab_caches, list) {
2716 for_each_node_state(node, N_NORMAL_MEMORY) {
2717 struct kmem_cache_node *n;
2718 struct kmem_cache_list *l;
2720 n = s->node_slab[node];
2721 if (!n)
2722 continue;
2723 l = &n->list;
2725 spin_lock_irq(&n->list_lock);
2726 claim_remote_free_list(s, l);
2727 flush_free_list_all(s, l);
2728 spin_unlock_irq(&n->list_lock);
2731 up_read(&slqb_lock);
2733 #endif
2735 static void cache_trim_worker(struct work_struct *w)
2737 struct delayed_work *work =
2738 container_of(w, struct delayed_work, work);
2739 struct kmem_cache *s;
2741 if (!down_read_trylock(&slqb_lock))
2742 goto out;
2744 list_for_each_entry(s, &slab_caches, list) {
2745 #ifdef CONFIG_NUMA
2746 int node = numa_node_id();
2747 struct kmem_cache_node *n = s->node_slab[node];
2749 if (n) {
2750 struct kmem_cache_list *l = &n->list;
2752 spin_lock_irq(&n->list_lock);
2753 claim_remote_free_list(s, l);
2754 flush_free_list(s, l);
2755 spin_unlock_irq(&n->list_lock);
2757 #endif
2759 local_irq_disable();
2760 kmem_cache_trim_percpu(s);
2761 local_irq_enable();
2764 up_read(&slqb_lock);
2765 out:
2766 schedule_delayed_work(work, round_jiffies_relative(3*HZ));
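/*
 * Editor's note: each CPU's trim worker requeues itself roughly every
 * three seconds (round_jiffies_relative(3*HZ)), pulling back remotely
 * freed objects and trimming the local free lists even on an otherwise
 * idle CPU.
 */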
2769 static DEFINE_PER_CPU(struct delayed_work, cache_trim_work);
2771 static void __cpuinit start_cpu_timer(int cpu)
2773 struct delayed_work *cache_trim_work = &per_cpu(cache_trim_work, cpu);
2776 * When this gets called from do_initcalls via cpucache_init(),
2777 * init_workqueues() has already run, so keventd will already be
2778 * set up by that time.
2780 if (keventd_up() && cache_trim_work->work.func == NULL) {
2781 INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
2782 schedule_delayed_work_on(cpu, cache_trim_work,
2783 __round_jiffies_relative(HZ, cpu));
2787 static int __init cpucache_init(void)
2789 int cpu;
2791 for_each_online_cpu(cpu)
2792 start_cpu_timer(cpu);
2794 return 0;
2796 device_initcall(cpucache_init);
2798 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2799 static void slab_mem_going_offline_callback(void *arg)
2801 kmem_cache_reap();
2804 static void slab_mem_offline_callback(void *arg)
2806 /* XXX: should release structures, see CPU offline comment */
2809 static int slab_mem_going_online_callback(void *arg)
2811 struct kmem_cache *s;
2812 struct kmem_cache_node *n;
2813 struct memory_notify *marg = arg;
2814 int nid = marg->status_change_nid;
2815 int ret = 0;
2818 * If the node's memory is already available, then kmem_cache_node is
2819 * already created. Nothing to do.
2821 if (nid < 0)
2822 return 0;
2825 * We are bringing a node online. No memory is available yet. We must
2826 * allocate a kmem_cache_node structure in order to bring the node
2827 * online.
2829 down_write(&slqb_lock);
2830 list_for_each_entry(s, &slab_caches, list) {
2832 * XXX: kmem_cache_alloc_node will fall back to other nodes
2833 * since memory is not yet available from the node that
2834 * is brought up.
2836 if (s->node_slab[nid]) /* could be leftover from last online */
2837 continue;
2838 n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
2839 if (!n) {
2840 ret = -ENOMEM;
2841 goto out;
2843 init_kmem_cache_node(s, n);
2844 s->node_slab[nid] = n;
2846 out:
2847 up_write(&slqb_lock);
2848 return ret;
2851 static int slab_memory_callback(struct notifier_block *self,
2852 unsigned long action, void *arg)
2854 int ret = 0;
2856 switch (action) {
2857 case MEM_GOING_ONLINE:
2858 ret = slab_mem_going_online_callback(arg);
2859 break;
2860 case MEM_GOING_OFFLINE:
2861 slab_mem_going_offline_callback(arg);
2862 break;
2863 case MEM_OFFLINE:
2864 case MEM_CANCEL_ONLINE:
2865 slab_mem_offline_callback(arg);
2866 break;
2867 case MEM_ONLINE:
2868 case MEM_CANCEL_OFFLINE:
2869 break;
2872 if (ret)
2873 ret = notifier_from_errno(ret);
2874 else
2875 ret = NOTIFY_OK;
2876 return ret;
2879 #endif /* CONFIG_MEMORY_HOTPLUG */
2881 /********************************************************************
2882 * Basic setup of slabs
2883 *******************************************************************/
2885 void __init kmem_cache_init(void)
2887 int i;
2888 unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
2891 * All the ifdefs are rather ugly here, but it's just the setup code,
2892 * so it doesn't have to be too readable :)
2896 * No need to take slqb_lock here: there should be no concurrency
2897 * anyway, and spin_unlock_irq in rwsem code could enable interrupts
2898 * too early.
2900 kmem_cache_open(&kmem_cache_cache, "kmem_cache",
2901 sizeof(struct kmem_cache), 0, flags, NULL, 0);
2902 #ifdef CONFIG_SMP
2903 kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
2904 sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
2905 #endif
2906 #ifdef CONFIG_NUMA
2907 kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
2908 sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
2909 #endif
2911 #ifdef CONFIG_SMP
2912 for_each_possible_cpu(i) {
2913 struct kmem_cache_cpu *c;
2915 c = &per_cpu(kmem_cache_cpus, i);
2916 init_kmem_cache_cpu(&kmem_cache_cache, c);
2917 kmem_cache_cache.cpu_slab[i] = c;
2919 c = &per_cpu(kmem_cpu_cpus, i);
2920 init_kmem_cache_cpu(&kmem_cpu_cache, c);
2921 kmem_cpu_cache.cpu_slab[i] = c;
2923 #ifdef CONFIG_NUMA
2924 c = &per_cpu(kmem_node_cpus, i);
2925 init_kmem_cache_cpu(&kmem_node_cache, c);
2926 kmem_node_cache.cpu_slab[i] = c;
2927 #endif
2929 #else
2930 init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
2931 #endif
2933 #ifdef CONFIG_NUMA
2934 for_each_node_state(i, N_NORMAL_MEMORY) {
2935 struct kmem_cache_node *n;
2937 n = &kmem_cache_nodes[i];
2938 init_kmem_cache_node(&kmem_cache_cache, n);
2939 kmem_cache_cache.node_slab[i] = n;
2940 #ifdef CONFIG_SMP
2941 n = &kmem_cpu_nodes[i];
2942 init_kmem_cache_node(&kmem_cpu_cache, n);
2943 kmem_cpu_cache.node_slab[i] = n;
2944 #endif
2945 n = &kmem_node_nodes[i];
2946 init_kmem_cache_node(&kmem_node_cache, n);
2947 kmem_node_cache.node_slab[i] = n;
2949 #endif
2951 /* Caches that are not of the two-to-the-power-of size */
2952 if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
2953 open_kmalloc_cache(&kmalloc_caches[1],
2954 "kmalloc-96", 96, GFP_KERNEL);
2955 #ifdef CONFIG_ZONE_DMA
2956 open_kmalloc_cache(&kmalloc_caches_dma[1],
2957 "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
2958 #endif
2960 if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
2961 open_kmalloc_cache(&kmalloc_caches[2],
2962 "kmalloc-192", 192, GFP_KERNEL);
2963 #ifdef CONFIG_ZONE_DMA
2964 open_kmalloc_cache(&kmalloc_caches_dma[2],
2965 "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
2966 #endif
2969 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2970 open_kmalloc_cache(&kmalloc_caches[i],
2971 "kmalloc", 1 << i, GFP_KERNEL);
2972 #ifdef CONFIG_ZONE_DMA
2973 open_kmalloc_cache(&kmalloc_caches_dma[i],
2974 "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
2975 #endif
2979 * Patch up the size_index table if we have strange large alignment
2980 * requirements for the kmalloc array. This seems to be the case only on
2981 * MIPS. The standard arches will not generate any code here.
2983 * Largest permitted alignment is 256 bytes due to the way we
2984 * handle the index determination for the smaller caches.
2986 * Make sure that nothing crazy happens if someone starts tinkering
2987 * around with ARCH_KMALLOC_MINALIGN
2989 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2990 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2992 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2993 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2995 /* Provide the correct kmalloc names now that the caches are up */
2996 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2997 kmalloc_caches[i].name =
2998 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2999 #ifdef CONFIG_ZONE_DMA
3000 kmalloc_caches_dma[i].name =
3001 kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
3002 #endif
3005 #ifdef CONFIG_SMP
3006 register_cpu_notifier(&slab_notifier);
3007 #endif
3008 #ifdef CONFIG_NUMA
3009 hotplug_memory_notifier(slab_memory_callback, 1);
3010 #endif
3012 * smp_init() has not yet been called, so no worries about memory
3013 * ordering with __slab_is_available.
3015 __slab_is_available = 1;
3018 void __init kmem_cache_init_late(void)
3023 * Some basic slab creation sanity checks
3025 static int kmem_cache_create_ok(const char *name, size_t size,
3026 size_t align, unsigned long flags)
3028 struct kmem_cache *tmp;
3031 * Sanity checks... these are all serious usage bugs.
3033 if (!name || in_interrupt() || (size < sizeof(void *))) {
3034 printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
3035 name);
3036 dump_stack();
3038 return 0;
3041 list_for_each_entry(tmp, &slab_caches, list) {
3042 char x;
3043 int res;
3046 * This happens when a module gets unloaded without destroying its
3047 * slab cache, and nobody else has since reused the module's vmalloc
3048 * area. Print a warning.
3050 res = probe_kernel_address(tmp->name, x);
3051 if (res) {
3052 printk(KERN_ERR
3053 "SLAB: cache with size %d has lost its name\n",
3054 tmp->size);
3055 continue;
3058 if (!strcmp(tmp->name, name)) {
3059 printk(KERN_ERR
3060 "SLAB: duplicate cache %s\n", name);
3061 dump_stack();
3063 return 0;
3067 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
3068 if (flags & SLAB_DESTROY_BY_RCU)
3069 WARN_ON(flags & SLAB_POISON);
3071 return 1;
3074 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3075 size_t align, unsigned long flags, void (*ctor)(void *))
3077 struct kmem_cache *s;
3079 down_write(&slqb_lock);
3080 if (!kmem_cache_create_ok(name, size, align, flags))
3081 goto err;
3083 s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
3084 if (!s)
3085 goto err;
3087 if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) {
3088 up_write(&slqb_lock);
3089 return s;
3092 kmem_cache_free(&kmem_cache_cache, s);
3094 err:
3095 up_write(&slqb_lock);
3096 if (flags & SLAB_PANIC)
3097 panic("%s: failed to create slab `%s'\n", __func__, name);
3099 return NULL;
3101 EXPORT_SYMBOL(kmem_cache_create);
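/*
 * Usage sketch (editor's illustration only; the cache name and object
 * type below are made up, not part of SLQB):
 *
 *	struct foo { int a; struct list_head link; };
 *	static struct kmem_cache *foo_cachep;
 *	struct foo *p;
 *
 *	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
 *				       SLAB_HWCACHE_ALIGN, NULL);
 *	if (!foo_cachep)
 *		return -ENOMEM;
 *	p = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cachep, p);
 *	kmem_cache_destroy(foo_cachep);
 */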
3103 #ifdef CONFIG_SMP
3105 * Use the cpu notifier to ensure that the cpu slabs are flushed when
3106 * necessary.
3108 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3109 unsigned long action, void *hcpu)
3111 long cpu = (long)hcpu;
3112 struct kmem_cache *s;
3114 switch (action) {
3115 case CPU_UP_PREPARE:
3116 case CPU_UP_PREPARE_FROZEN:
3117 down_write(&slqb_lock);
3118 list_for_each_entry(s, &slab_caches, list) {
3119 if (s->cpu_slab[cpu]) /* could be leftover from last online */
3120 continue;
3121 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
3122 if (!s->cpu_slab[cpu]) {
3123 up_write(&slqb_lock);
3124 return NOTIFY_BAD;
3127 up_write(&slqb_lock);
3128 break;
3130 case CPU_ONLINE:
3131 case CPU_ONLINE_FROZEN:
3132 case CPU_DOWN_FAILED:
3133 case CPU_DOWN_FAILED_FROZEN:
3134 start_cpu_timer(cpu);
3135 break;
3137 case CPU_DOWN_PREPARE:
3138 case CPU_DOWN_PREPARE_FROZEN:
3139 cancel_rearming_delayed_work(&per_cpu(cache_trim_work, cpu));
3140 per_cpu(cache_trim_work, cpu).work.func = NULL;
3141 break;
3143 case CPU_UP_CANCELED:
3144 case CPU_UP_CANCELED_FROZEN:
3145 case CPU_DEAD:
3146 case CPU_DEAD_FROZEN:
3148 * XXX: Freeing here doesn't work because objects can still be
3149 * on this CPU's list. The periodic timer needs to check whether a CPU
3150 * is offline and then try to clean up from there. The same applies to
3151 * node offline.
3153 default:
3154 break;
3156 return NOTIFY_OK;
3159 static struct notifier_block __cpuinitdata slab_notifier = {
3160 .notifier_call = slab_cpuup_callback
3163 #endif
3165 #ifdef CONFIG_SLQB_DEBUG
3166 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3168 struct kmem_cache *s;
3169 int node = -1;
3171 s = get_slab(size, flags);
3172 if (unlikely(ZERO_OR_NULL_PTR(s)))
3173 return s;
3175 #ifdef CONFIG_NUMA
3176 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3177 node = alternate_nid(s, flags, node);
3178 #endif
3179 return slab_alloc(s, flags, node, caller);
3182 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
3183 unsigned long caller)
3185 struct kmem_cache *s;
3187 s = get_slab(size, flags);
3188 if (unlikely(ZERO_OR_NULL_PTR(s)))
3189 return s;
3191 return slab_alloc(s, flags, node, caller);
3193 #endif
3195 #if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
3196 struct stats_gather {
3197 struct kmem_cache *s;
3198 spinlock_t lock;
3199 unsigned long nr_slabs;
3200 unsigned long nr_partial;
3201 unsigned long nr_inuse;
3202 unsigned long nr_objects;
3204 #ifdef CONFIG_SLQB_STATS
3205 unsigned long stats[NR_SLQB_STAT_ITEMS];
3206 #endif
3209 static void __gather_stats(void *arg)
3211 unsigned long nr_slabs;
3212 unsigned long nr_partial;
3213 unsigned long nr_inuse;
3214 struct stats_gather *gather = arg;
3215 int cpu = smp_processor_id();
3216 struct kmem_cache *s = gather->s;
3217 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3218 struct kmem_cache_list *l = &c->list;
3219 struct slqb_page *page;
3220 #ifdef CONFIG_SLQB_STATS
3221 int i;
3222 #endif
3224 spin_lock(&l->page_lock);
3225 nr_slabs = l->nr_slabs;
3226 nr_partial = l->nr_partial;
3227 nr_inuse = (nr_slabs - nr_partial) * s->objects;
3229 list_for_each_entry(page, &l->partial, lru) {
3230 nr_inuse += page->inuse;
3232 spin_unlock(&l->page_lock);
3234 spin_lock(&gather->lock);
3235 gather->nr_slabs += nr_slabs;
3236 gather->nr_partial += nr_partial;
3237 gather->nr_inuse += nr_inuse;
3238 #ifdef CONFIG_SLQB_STATS
3239 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3240 gather->stats[i] += l->stats[i];
3241 #endif
3242 spin_unlock(&gather->lock);
3245 /* must be called with slqb_lock held */
3246 static void gather_stats_locked(struct kmem_cache *s,
3247 struct stats_gather *stats)
3249 #ifdef CONFIG_NUMA
3250 int node;
3251 #endif
3253 memset(stats, 0, sizeof(struct stats_gather));
3254 stats->s = s;
3255 spin_lock_init(&stats->lock);
3257 on_each_cpu(__gather_stats, stats, 1);
3259 #ifdef CONFIG_NUMA
3260 for_each_online_node(node) {
3261 struct kmem_cache_node *n = s->node_slab[node];
3262 struct kmem_cache_list *l;
3263 struct slqb_page *page;
3264 unsigned long flags;
3265 #ifdef CONFIG_SLQB_STATS
3266 int i;
3267 #endif
if (!n) /* no kmem_cache_node here (e.g. memoryless node) */
continue;
l = &n->list;
3269 spin_lock_irqsave(&n->list_lock, flags);
3270 #ifdef CONFIG_SLQB_STATS
3271 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3272 stats->stats[i] += l->stats[i];
3273 #endif
3274 stats->nr_slabs += l->nr_slabs;
3275 stats->nr_partial += l->nr_partial;
3276 stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
3278 list_for_each_entry(page, &l->partial, lru) {
3279 stats->nr_inuse += page->inuse;
3281 spin_unlock_irqrestore(&n->list_lock, flags);
3283 #endif
3285 stats->nr_objects = stats->nr_slabs * s->objects;
3288 #ifdef CONFIG_SLQB_SYSFS
3289 static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
3291 down_read(&slqb_lock); /* hold off hotplug */
3292 gather_stats_locked(s, stats);
3293 up_read(&slqb_lock);
3295 #endif
3296 #endif
3299 * The /proc/slabinfo ABI
3301 #ifdef CONFIG_SLABINFO
3302 #include <linux/proc_fs.h>
3303 #include <linux/seq_file.h>
3304 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3305 size_t count, loff_t *ppos)
3307 return -EINVAL;
3310 static void print_slabinfo_header(struct seq_file *m)
3312 seq_puts(m, "slabinfo - version: 2.1\n");
3313 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3314 "<objperslab> <pagesperslab>");
3315 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3316 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3317 seq_putc(m, '\n');
3320 static void *s_start(struct seq_file *m, loff_t *pos)
3322 loff_t n = *pos;
3324 down_read(&slqb_lock);
3325 if (!n)
3326 print_slabinfo_header(m);
3328 return seq_list_start(&slab_caches, *pos);
3331 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3333 return seq_list_next(p, &slab_caches, pos);
3336 static void s_stop(struct seq_file *m, void *p)
3338 up_read(&slqb_lock);
3341 static int s_show(struct seq_file *m, void *p)
3343 struct stats_gather stats;
3344 struct kmem_cache *s;
3346 s = list_entry(p, struct kmem_cache, list);
3348 gather_stats_locked(s, &stats);
3350 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
3351 stats.nr_objects, s->size, s->objects, (1 << s->order));
3352 seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
3353 slab_freebatch(s), 0);
3354 seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
3355 stats.nr_slabs, 0UL);
3356 seq_putc(m, '\n');
3357 return 0;
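/*
 * Editor's note: given the format strings above, a /proc/slabinfo line
 * for a 192-byte cache might look like (numbers invented):
 *
 *   kmalloc-192   1050   1260    192   21    1 : tunables 1024  256    0 : slabdata     60     60      0
 *
 * i.e. active objects, total objects, object size, objects per slab and
 * pages per slab, then hiwater/freebatch as the "tunables" and the slab
 * counts as "slabdata".
 */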
3360 static const struct seq_operations slabinfo_op = {
3361 .start = s_start,
3362 .next = s_next,
3363 .stop = s_stop,
3364 .show = s_show,
3367 static int slabinfo_open(struct inode *inode, struct file *file)
3369 return seq_open(file, &slabinfo_op);
3372 static const struct file_operations proc_slabinfo_operations = {
3373 .open = slabinfo_open,
3374 .read = seq_read,
3375 .llseek = seq_lseek,
3376 .release = seq_release,
3379 static int __init slab_proc_init(void)
3381 proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
3382 &proc_slabinfo_operations);
3383 return 0;
3385 module_init(slab_proc_init);
3386 #endif /* CONFIG_SLABINFO */
3388 #ifdef CONFIG_SLQB_SYSFS
3390 * sysfs API
3392 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3393 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
3395 struct slab_attribute {
3396 struct attribute attr;
3397 ssize_t (*show)(struct kmem_cache *s, char *buf);
3398 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3401 #define SLAB_ATTR_RO(_name) \
3402 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3404 #define SLAB_ATTR(_name) \
3405 static struct slab_attribute _name##_attr = \
3406 __ATTR(_name, 0644, _name##_show, _name##_store)
3408 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3410 return sprintf(buf, "%d\n", s->size);
3412 SLAB_ATTR_RO(slab_size);
3414 static ssize_t align_show(struct kmem_cache *s, char *buf)
3416 return sprintf(buf, "%d\n", s->align);
3418 SLAB_ATTR_RO(align);
3420 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3422 return sprintf(buf, "%d\n", s->objsize);
3424 SLAB_ATTR_RO(object_size);
3426 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3428 return sprintf(buf, "%d\n", s->objects);
3430 SLAB_ATTR_RO(objs_per_slab);
3432 static ssize_t order_show(struct kmem_cache *s, char *buf)
3434 return sprintf(buf, "%d\n", s->order);
3436 SLAB_ATTR_RO(order);
3438 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3440 if (s->ctor) {
3441 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3443 return n + sprintf(buf + n, "\n");
3445 return 0;
3447 SLAB_ATTR_RO(ctor);
3449 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3451 struct stats_gather stats;
3453 gather_stats(s, &stats);
3455 return sprintf(buf, "%lu\n", stats.nr_slabs);
3457 SLAB_ATTR_RO(slabs);
3459 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3461 struct stats_gather stats;
3463 gather_stats(s, &stats);
3465 return sprintf(buf, "%lu\n", stats.nr_inuse);
3467 SLAB_ATTR_RO(objects);
3469 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3471 struct stats_gather stats;
3473 gather_stats(s, &stats);
3475 return sprintf(buf, "%lu\n", stats.nr_objects);
3477 SLAB_ATTR_RO(total_objects);
3479 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3481 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3483 SLAB_ATTR_RO(reclaim_account);
3485 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3487 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3489 SLAB_ATTR_RO(hwcache_align);
3491 #ifdef CONFIG_ZONE_DMA
3492 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3494 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3496 SLAB_ATTR_RO(cache_dma);
3497 #endif
3499 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3501 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3503 SLAB_ATTR_RO(destroy_by_rcu);
3505 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3507 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3509 SLAB_ATTR_RO(red_zone);
3511 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3513 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3515 SLAB_ATTR_RO(poison);
3517 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3519 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3521 SLAB_ATTR_RO(store_user);
3523 static ssize_t hiwater_store(struct kmem_cache *s,
3524 const char *buf, size_t length)
3526 long hiwater;
3527 int err;
3529 err = strict_strtol(buf, 10, &hiwater);
3530 if (err)
3531 return err;
3533 if (hiwater < 0)
3534 return -EINVAL;
3536 s->hiwater = hiwater;
3538 return length;
3541 static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
3543 return sprintf(buf, "%d\n", slab_hiwater(s));
3545 SLAB_ATTR(hiwater);
3547 static ssize_t freebatch_store(struct kmem_cache *s,
3548 const char *buf, size_t length)
3550 long freebatch;
3551 int err;
3553 err = strict_strtol(buf, 10, &freebatch);
3554 if (err)
3555 return err;
3557 if (freebatch <= 0 || freebatch - 1 > s->hiwater)
3558 return -EINVAL;
3560 s->freebatch = freebatch;
3562 return length;
3565 static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
3567 return sprintf(buf, "%d\n", slab_freebatch(s));
3569 SLAB_ATTR(freebatch);
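/*
 * Editor's note: hiwater and freebatch are the only writable attributes.
 * With sysfs mounted they should appear under /sys/kernel/slab/<cache>/,
 * e.g. (illustrative only):
 *
 *	echo 2048 > /sys/kernel/slab/kmalloc-256/hiwater
 *	echo 512  > /sys/kernel/slab/kmalloc-256/freebatch
 *
 * freebatch must be positive and at most hiwater + 1, as enforced by
 * freebatch_store() above.
 */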
3571 #ifdef CONFIG_SLQB_STATS
3572 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3574 struct stats_gather stats;
3575 int len;
3576 #ifdef CONFIG_SMP
3577 int cpu;
3578 #endif
3580 gather_stats(s, &stats);
3582 len = sprintf(buf, "%lu", stats.stats[si]);
3584 #ifdef CONFIG_SMP
3585 for_each_online_cpu(cpu) {
3586 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3587 struct kmem_cache_list *l = &c->list;
3589 if (len < PAGE_SIZE - 20)
3590 len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
3592 #endif
3593 return len + sprintf(buf + len, "\n");
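/*
 * Editor's note: each statistics attribute prints the cache-wide total
 * followed by a per-CPU breakdown, e.g. reading "alloc" might return
 * something like (numbers invented):
 *
 *	123456 C0=40000 C1=43456 C2=20000 C3=20000
 */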
3596 #define STAT_ATTR(si, text) \
3597 static ssize_t text##_show(struct kmem_cache *s, char *buf) \
3599 return show_stat(s, buf, si); \
3601 SLAB_ATTR_RO(text); \
3603 STAT_ATTR(ALLOC, alloc);
3604 STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
3605 STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
3606 STAT_ATTR(FREE, free);
3607 STAT_ATTR(FREE_REMOTE, free_remote);
3608 STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
3609 STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
3610 STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
3611 STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
3612 STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
3613 STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
3614 STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
3615 STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
3616 STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
3617 #endif
3619 static struct attribute *slab_attrs[] = {
3620 &slab_size_attr.attr,
3621 &object_size_attr.attr,
3622 &objs_per_slab_attr.attr,
3623 &order_attr.attr,
3624 &objects_attr.attr,
3625 &total_objects_attr.attr,
3626 &slabs_attr.attr,
3627 &ctor_attr.attr,
3628 &align_attr.attr,
3629 &hwcache_align_attr.attr,
3630 &reclaim_account_attr.attr,
3631 &destroy_by_rcu_attr.attr,
3632 &red_zone_attr.attr,
3633 &poison_attr.attr,
3634 &store_user_attr.attr,
3635 &hiwater_attr.attr,
3636 &freebatch_attr.attr,
3637 #ifdef CONFIG_ZONE_DMA
3638 &cache_dma_attr.attr,
3639 #endif
3640 #ifdef CONFIG_SLQB_STATS
3641 &alloc_attr.attr,
3642 &alloc_slab_fill_attr.attr,
3643 &alloc_slab_new_attr.attr,
3644 &free_attr.attr,
3645 &free_remote_attr.attr,
3646 &flush_free_list_attr.attr,
3647 &flush_free_list_objects_attr.attr,
3648 &flush_free_list_remote_attr.attr,
3649 &flush_slab_partial_attr.attr,
3650 &flush_slab_free_attr.attr,
3651 &flush_rfree_list_attr.attr,
3652 &flush_rfree_list_objects_attr.attr,
3653 &claim_remote_list_attr.attr,
3654 &claim_remote_list_objects_attr.attr,
3655 #endif
3656 NULL
3659 static struct attribute_group slab_attr_group = {
3660 .attrs = slab_attrs,
3663 static ssize_t slab_attr_show(struct kobject *kobj,
3664 struct attribute *attr, char *buf)
3666 struct slab_attribute *attribute;
3667 struct kmem_cache *s;
3668 int err;
3670 attribute = to_slab_attr(attr);
3671 s = to_slab(kobj);
3673 if (!attribute->show)
3674 return -EIO;
3676 err = attribute->show(s, buf);
3678 return err;
3681 static ssize_t slab_attr_store(struct kobject *kobj,
3682 struct attribute *attr, const char *buf, size_t len)
3684 struct slab_attribute *attribute;
3685 struct kmem_cache *s;
3686 int err;
3688 attribute = to_slab_attr(attr);
3689 s = to_slab(kobj);
3691 if (!attribute->store)
3692 return -EIO;
3694 err = attribute->store(s, buf, len);
3696 return err;
3699 static void kmem_cache_release(struct kobject *kobj)
3701 struct kmem_cache *s = to_slab(kobj);
3703 kmem_cache_free(&kmem_cache_cache, s);
3706 static struct sysfs_ops slab_sysfs_ops = {
3707 .show = slab_attr_show,
3708 .store = slab_attr_store,
3711 static struct kobj_type slab_ktype = {
3712 .sysfs_ops = &slab_sysfs_ops,
3713 .release = kmem_cache_release
3716 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3718 struct kobj_type *ktype = get_ktype(kobj);
3720 if (ktype == &slab_ktype)
3721 return 1;
3722 return 0;
3725 static struct kset_uevent_ops slab_uevent_ops = {
3726 .filter = uevent_filter,
3729 static struct kset *slab_kset;
3731 static int sysfs_available __read_mostly;
3733 static int sysfs_slab_add(struct kmem_cache *s)
3735 int err;
3737 if (!sysfs_available)
3738 return 0;
3740 s->kobj.kset = slab_kset;
3741 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", s->name);
3742 if (err) {
3743 kobject_put(&s->kobj);
3744 return err;
3747 err = sysfs_create_group(&s->kobj, &slab_attr_group);
3748 if (err)
3749 return err;
3751 kobject_uevent(&s->kobj, KOBJ_ADD);
3753 return 0;
3756 static void sysfs_slab_remove(struct kmem_cache *s)
3758 kobject_uevent(&s->kobj, KOBJ_REMOVE);
3759 kobject_del(&s->kobj);
3760 kobject_put(&s->kobj);
3763 static int __init slab_sysfs_init(void)
3765 struct kmem_cache *s;
3766 int err;
3768 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
3769 if (!slab_kset) {
3770 printk(KERN_ERR "Cannot register slab subsystem.\n");
3771 return -ENOSYS;
3774 down_write(&slqb_lock);
3776 sysfs_available = 1;
3778 list_for_each_entry(s, &slab_caches, list) {
3779 err = sysfs_slab_add(s);
3780 if (err)
3781 printk(KERN_ERR "SLQB: Unable to add boot slab %s"
3782 " to sysfs\n", s->name);
3785 up_write(&slqb_lock);
3787 return 0;
3789 device_initcall(slab_sysfs_init);
3791 #endif