mm/slqb.c
1 /*
2 * SLQB: A slab allocator that focuses on per-CPU scaling, and good performance
3  * with order-0 allocations. Fastpath emphasis is placed on local allocation
4 * and freeing, but with a secondary goal of good remote freeing (freeing on
5 * another CPU from that which allocated).
7 * Using ideas and code from mm/slab.c, mm/slob.c, and mm/slub.c.
8 */
10 #include <linux/mm.h>
11 #include <linux/swap.h> /* struct reclaim_state */
12 #include <linux/module.h>
13 #include <linux/interrupt.h>
14 #include <linux/slab.h>
15 #include <linux/seq_file.h>
16 #include <linux/cpu.h>
17 #include <linux/cpuset.h>
18 #include <linux/mempolicy.h>
19 #include <linux/ctype.h>
20 #include <linux/kallsyms.h>
21 #include <linux/memory.h>
22 #include <linux/fault-inject.h>
25 * TODO
26 * - fix up releasing of offlined data structures. Not a big deal because
27 * they don't get cumulatively leaked with successive online/offline cycles
28 * - allow OOM conditions to flush back per-CPU pages to common lists to be
29 * reused by other CPUs.
30  * - investigate performance with memoryless nodes. Perhaps CPUs can be given
31  *   a default closest home node via which they can use fastpath functions.
32 * Perhaps it is not a big problem.
36  * slqb_page overloads struct page, and is used to manage some slab allocation
37 * aspects, however to avoid the horrible mess in include/linux/mm_types.h,
38 * we'll just define our own struct slqb_page type variant here.
40 struct slqb_page {
41 union {
42 struct {
43 unsigned long flags; /* mandatory */
44 atomic_t _count; /* mandatory */
45 unsigned int inuse; /* Nr of objects */
46 struct kmem_cache_list *list; /* Pointer to list */
47 void **freelist; /* LIFO freelist */
48 union {
49 struct list_head lru; /* misc. list */
50 struct rcu_head rcu_head; /* for rcu freeing */
53 struct page page;
56 static inline void struct_slqb_page_wrong_size(void)
57 { BUILD_BUG_ON(sizeof(struct slqb_page) != sizeof(struct page)); }
59 #define PG_SLQB_BIT (1 << PG_slab)
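/*
 * SLQB marks its pages by setting the PG_slab bit in page->flags; the free
 * path and kmem_ptr_validate() below rely on this bit to recognise slab pages.
 */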
62 * slqb_min_order: minimum allocation order for slabs
64 static int slqb_min_order;
67 * slqb_min_objects: minimum number of objects per slab. Increasing this
68 * will increase the allocation order for slabs with larger objects
70 static int slqb_min_objects = 1;
72 #ifdef CONFIG_NUMA
73 static inline int slab_numa(struct kmem_cache *s)
75 return s->flags & SLAB_NUMA;
77 #else
78 static inline int slab_numa(struct kmem_cache *s)
80 return 0;
82 #endif
84 static inline int slab_hiwater(struct kmem_cache *s)
86 return s->hiwater;
89 static inline int slab_freebatch(struct kmem_cache *s)
91 return s->freebatch;
95 * Lock order:
96 * kmem_cache_node->list_lock
97 * kmem_cache_remote_free->lock
99 * Data structures:
100 * SLQB is primarily per-cpu. For each kmem_cache, each CPU has:
102 * - A LIFO list of node-local objects. Allocation and freeing of node local
103 * objects goes first to this list.
105 * - 2 Lists of slab pages, free and partial pages. If an allocation misses
106 * the object list, it tries from the partial list, then the free list.
107 * After freeing an object to the object list, if it is over a watermark,
108 * some objects are freed back to pages. If an allocation misses these lists,
109 * a new slab page is allocated from the page allocator. If the free list
110 * reaches a watermark, some of its pages are returned to the page allocator.
112 * - A remote free queue, where objects freed that did not come from the local
113 * node are queued to. When this reaches a watermark, the objects are
114 * flushed.
116 * - A remotely freed queue, where objects allocated from this CPU are flushed
117 * to from other CPUs' remote free queues. kmem_cache_remote_free->lock is
118 * used to protect access to this queue.
120 * When the remotely freed queue reaches a watermark, a flag is set to tell
121 * the owner CPU to check it. The owner CPU will then check the queue on the
122 * next allocation that misses the object list. It will move all objects from
123 * this list onto the object list and then allocate one.
125 * This system of remote queueing is intended to reduce lock and remote
126 * cacheline acquisitions, and give a cooling off period for remotely freed
127 * objects before they are re-allocated.
129  * Node-specific allocations from somewhere other than the local node are
130 * handled by a per-node list which is the same as the above per-CPU data
131 * structures except for the following differences:
133 * - kmem_cache_node->list_lock is used to protect access for multiple CPUs to
134 * allocate from a given node.
136 * - There is no remote free queue. Nodes don't free objects, CPUs do.
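/*
 * A rough sketch of the object flow described above (a summary of the design
 * comment, not literal code; the real structure definitions live in the SLQB
 * header rather than this file):
 *
 *   allocation miss:  CPU LIFO freelist -> partial list -> free list
 *                     -> page allocator (new slab page)
 *   local free:       object -> CPU LIFO freelist
 *                     (flushed back to its pages past the hiwater mark)
 *   remote free:      object -> this CPU's remote free queue
 *                     -> owner list's remotely-freed queue (under its lock)
 *                     -> claimed onto the owner's freelist on its next miss
 */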
139 static inline void slqb_stat_inc(struct kmem_cache_list *list,
140 enum stat_item si)
142 #ifdef CONFIG_SLQB_STATS
143 list->stats[si]++;
144 #endif
147 static inline void slqb_stat_add(struct kmem_cache_list *list,
148 enum stat_item si, unsigned long nr)
150 #ifdef CONFIG_SLQB_STATS
151 list->stats[si] += nr;
152 #endif
155 static inline int slqb_page_to_nid(struct slqb_page *page)
157 return page_to_nid(&page->page);
160 static inline void *slqb_page_address(struct slqb_page *page)
162 return page_address(&page->page);
165 static inline struct zone *slqb_page_zone(struct slqb_page *page)
167 return page_zone(&page->page);
170 static inline int virt_to_nid(const void *addr)
172 return page_to_nid(virt_to_page(addr));
175 static inline struct slqb_page *virt_to_head_slqb_page(const void *addr)
177 struct page *p;
179 p = virt_to_head_page(addr);
180 return (struct slqb_page *)p;
183 static inline void __free_slqb_pages(struct slqb_page *page, unsigned int order,
184 int pages)
186 struct page *p = &page->page;
188 reset_page_mapcount(p);
189 p->mapping = NULL;
190 VM_BUG_ON(!(p->flags & PG_SLQB_BIT));
191 p->flags &= ~PG_SLQB_BIT;
193 if (current->reclaim_state)
194 current->reclaim_state->reclaimed_slab += pages;
195 __free_pages(p, order);
198 #ifdef CONFIG_SLQB_DEBUG
199 static inline int slab_debug(struct kmem_cache *s)
201 return s->flags &
202 (SLAB_DEBUG_FREE |
203 SLAB_RED_ZONE |
204 SLAB_POISON |
205 SLAB_STORE_USER |
206 SLAB_TRACE);
208 static inline int slab_poison(struct kmem_cache *s)
210 return s->flags & SLAB_POISON;
212 #else
213 static inline int slab_debug(struct kmem_cache *s)
215 return 0;
217 static inline int slab_poison(struct kmem_cache *s)
219 return 0;
221 #endif
223 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
224 SLAB_POISON | SLAB_STORE_USER)
226 /* Internal SLQB flags */
227 #define __OBJECT_POISON 0x80000000 /* Poison object */
229 /* Not all arches define cache_line_size */
230 #ifndef cache_line_size
231 #define cache_line_size() L1_CACHE_BYTES
232 #endif
234 #ifdef CONFIG_SMP
235 static struct notifier_block slab_notifier;
236 #endif
239  * slqb_lock protects the slab_caches list and serialises hotplug operations.
240  * Hotplug operations take the lock for write; other operations can hold off
241 * hotplug by taking it for read (or write).
243 static DECLARE_RWSEM(slqb_lock);
246 * A list of all slab caches on the system
248 static LIST_HEAD(slab_caches);
251 * Tracking user of a slab.
253 struct track {
254 unsigned long addr; /* Called from address */
255 int cpu; /* Was running on cpu */
256 int pid; /* Pid context */
257 unsigned long when; /* When did the operation occur */
260 enum track_item { TRACK_ALLOC, TRACK_FREE };
262 static struct kmem_cache kmem_cache_cache;
264 #ifdef CONFIG_SLQB_SYSFS
265 static int sysfs_slab_add(struct kmem_cache *s);
266 static void sysfs_slab_remove(struct kmem_cache *s);
267 #else
268 static inline int sysfs_slab_add(struct kmem_cache *s)
270 return 0;
272 static inline void sysfs_slab_remove(struct kmem_cache *s)
274 kmem_cache_free(&kmem_cache_cache, s);
276 #endif
278 /********************************************************************
279 * Core slab cache functions
280 *******************************************************************/
282 static int __slab_is_available __read_mostly;
283 int slab_is_available(void)
285 return __slab_is_available;
288 static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
290 #ifdef CONFIG_SMP
291 VM_BUG_ON(!s->cpu_slab[cpu]);
292 return s->cpu_slab[cpu];
293 #else
294 return &s->cpu_slab;
295 #endif
298 static inline int check_valid_pointer(struct kmem_cache *s,
299 struct slqb_page *page, const void *object)
301 void *base;
303 base = slqb_page_address(page);
304 if (object < base || object >= base + s->objects * s->size ||
305 (object - base) % s->size) {
306 return 0;
309 return 1;
312 static inline void *get_freepointer(struct kmem_cache *s, void *object)
314 return *(void **)(object + s->offset);
317 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
319 *(void **)(object + s->offset) = fp;
322 /* Loop over all objects in a slab */
323 #define for_each_object(__p, __s, __addr) \
324 for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
325 __p += (__s)->size)
327 /* Scan freelist */
328 #define for_each_free_object(__p, __s, __free) \
329 for (__p = (__free); (__p) != NULL; __p = get_freepointer((__s),\
330 __p))
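/*
 * Note: objects within a slab page are laid out back to back at s->size byte
 * strides from the (possibly colour-offset) start of the page, and each
 * freelist is a singly linked list threaded through the free objects
 * themselves via the word at s->offset, as get_freepointer() and
 * set_freepointer() above show.
 */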
332 #ifdef CONFIG_SLQB_DEBUG
334 * Debug settings:
336 #ifdef CONFIG_SLQB_DEBUG_ON
337 static int slqb_debug __read_mostly = DEBUG_DEFAULT_FLAGS;
338 #else
339 static int slqb_debug __read_mostly;
340 #endif
342 static char *slqb_debug_slabs;
345 * Object debugging
347 static void print_section(char *text, u8 *addr, unsigned int length)
349 int i, offset;
350 int newline = 1;
351 char ascii[17];
353 ascii[16] = 0;
355 for (i = 0; i < length; i++) {
356 if (newline) {
357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
358 newline = 0;
360 printk(KERN_CONT " %02x", addr[i]);
361 offset = i % 16;
362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
363 if (offset == 15) {
364 printk(KERN_CONT " %s\n", ascii);
365 newline = 1;
368 if (!newline) {
369 i %= 16;
370 while (i < 16) {
371 printk(KERN_CONT " ");
372 ascii[i] = ' ';
373 i++;
375 printk(KERN_CONT " %s\n", ascii);
379 static struct track *get_track(struct kmem_cache *s, void *object,
380 enum track_item alloc)
382 struct track *p;
384 if (s->offset)
385 p = object + s->offset + sizeof(void *);
386 else
387 p = object + s->inuse;
389 return p + alloc;
392 static void set_track(struct kmem_cache *s, void *object,
393 enum track_item alloc, unsigned long addr)
395 struct track *p;
397 if (s->offset)
398 p = object + s->offset + sizeof(void *);
399 else
400 p = object + s->inuse;
402 p += alloc;
403 if (addr) {
404 p->addr = addr;
405 p->cpu = raw_smp_processor_id();
406 p->pid = current ? current->pid : -1;
407 p->when = jiffies;
408 } else
409 memset(p, 0, sizeof(struct track));
412 static void init_tracking(struct kmem_cache *s, void *object)
414 if (!(s->flags & SLAB_STORE_USER))
415 return;
417 set_track(s, object, TRACK_FREE, 0UL);
418 set_track(s, object, TRACK_ALLOC, 0UL);
421 static void print_track(const char *s, struct track *t)
423 if (!t->addr)
424 return;
426 printk(KERN_ERR "INFO: %s in ", s);
427 __print_symbol("%s", (unsigned long)t->addr);
428 printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid);
431 static void print_tracking(struct kmem_cache *s, void *object)
433 if (!(s->flags & SLAB_STORE_USER))
434 return;
436 print_track("Allocated", get_track(s, object, TRACK_ALLOC));
437 print_track("Freed", get_track(s, object, TRACK_FREE));
440 static void print_page_info(struct slqb_page *page)
442 printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
443 page, page->inuse, page->freelist, page->flags);
447 #define MAX_ERR_STR 100
448 static void slab_bug(struct kmem_cache *s, char *fmt, ...)
450 va_list args;
451 char buf[MAX_ERR_STR];
453 va_start(args, fmt);
454 vsnprintf(buf, sizeof(buf), fmt, args);
455 va_end(args);
456 printk(KERN_ERR "========================================"
457 "=====================================\n");
458 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
459 printk(KERN_ERR "----------------------------------------"
460 "-------------------------------------\n\n");
463 static void slab_fix(struct kmem_cache *s, char *fmt, ...)
465 va_list args;
466 char buf[100];
468 va_start(args, fmt);
469 vsnprintf(buf, sizeof(buf), fmt, args);
470 va_end(args);
471 printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
474 static void print_trailer(struct kmem_cache *s, struct slqb_page *page, u8 *p)
476 unsigned int off; /* Offset of last byte */
477 u8 *addr = slqb_page_address(page);
479 print_tracking(s, p);
481 print_page_info(page);
483 printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
484 p, p - addr, get_freepointer(s, p));
486 if (p > addr + 16)
487 print_section("Bytes b4", p - 16, 16);
489 print_section("Object", p, min(s->objsize, 128));
491 if (s->flags & SLAB_RED_ZONE)
492 print_section("Redzone", p + s->objsize, s->inuse - s->objsize);
494 if (s->offset)
495 off = s->offset + sizeof(void *);
496 else
497 off = s->inuse;
499 if (s->flags & SLAB_STORE_USER)
500 off += 2 * sizeof(struct track);
502 if (off != s->size) {
503 /* Beginning of the filler is the free pointer */
504 print_section("Padding", p + off, s->size - off);
507 dump_stack();
510 static void object_err(struct kmem_cache *s, struct slqb_page *page,
511 u8 *object, char *reason)
513 slab_bug(s, reason);
514 print_trailer(s, page, object);
517 static void slab_err(struct kmem_cache *s, struct slqb_page *page,
518 char *fmt, ...)
520 slab_bug(s, fmt);
521 print_page_info(page);
522 dump_stack();
525 static void init_object(struct kmem_cache *s, void *object, int active)
527 u8 *p = object;
529 if (s->flags & __OBJECT_POISON) {
530 memset(p, POISON_FREE, s->objsize - 1);
531 p[s->objsize - 1] = POISON_END;
534 if (s->flags & SLAB_RED_ZONE) {
535 memset(p + s->objsize,
536 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
537 s->inuse - s->objsize);
541 static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
543 while (bytes) {
544 if (*start != (u8)value)
545 return start;
546 start++;
547 bytes--;
549 return NULL;
552 static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
553 void *from, void *to)
555 slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
556 memset(from, data, to - from);
559 static int check_bytes_and_report(struct kmem_cache *s, struct slqb_page *page,
560 u8 *object, char *what,
561 u8 *start, unsigned int value, unsigned int bytes)
563 u8 *fault;
564 u8 *end;
566 fault = check_bytes(start, value, bytes);
567 if (!fault)
568 return 1;
570 end = start + bytes;
571 while (end > fault && end[-1] == value)
572 end--;
574 slab_bug(s, "%s overwritten", what);
575 printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
576 fault, end - 1, fault[0], value);
577 print_trailer(s, page, object);
579 restore_bytes(s, what, value, fault, end);
580 return 0;
584 * Object layout:
586 * object address
587 * Bytes of the object to be managed.
588 * If the freepointer may overlay the object then the free
589 * pointer is the first word of the object.
591 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
592 * 0xa5 (POISON_END)
594 * object + s->objsize
595 * Padding to reach word boundary. This is also used for Redzoning.
596 * Padding is extended by another word if Redzoning is enabled and
597 * objsize == inuse.
599 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
600 * 0xcc (RED_ACTIVE) for objects in use.
602 * object + s->inuse
603 * Meta data starts here.
605 * A. Free pointer (if we cannot overwrite object on free)
606 * B. Tracking data for SLAB_STORE_USER
607  * 	C. Padding to reach required alignment boundary or at minimum
608  * 		one word if debugging is on to be able to detect writes
609 * before the word boundary.
611 * Padding is done using 0x5a (POISON_INUSE)
613 * object + s->size
614 * Nothing is used beyond s->size.
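/*
 * The same layout in table form (each region is present only when the
 * corresponding flag or condition described above applies):
 *
 *	offset			contents
 *	0 .. objsize		object payload (POISON_FREE/POISON_END if poisoned)
 *	objsize .. inuse	red zone (RED_ACTIVE / RED_INACTIVE)
 *	inuse ..		free pointer (when it may not overlay the object),
 *				then 2 x struct track (SLAB_STORE_USER)
 *	.. size			POISON_INUSE padding up to the aligned object size
 */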
617 static int check_pad_bytes(struct kmem_cache *s, struct slqb_page *page, u8 *p)
619 unsigned long off = s->inuse; /* The end of info */
621 if (s->offset) {
622 /* Freepointer is placed after the object. */
623 off += sizeof(void *);
626 if (s->flags & SLAB_STORE_USER) {
627 /* We also have user information there */
628 off += 2 * sizeof(struct track);
631 if (s->size == off)
632 return 1;
634 return check_bytes_and_report(s, page, p, "Object padding",
635 p + off, POISON_INUSE, s->size - off);
638 static int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
640 u8 *start;
641 u8 *fault;
642 u8 *end;
643 int length;
644 int remainder;
646 if (!(s->flags & SLAB_POISON))
647 return 1;
649 start = slqb_page_address(page);
650 end = start + (PAGE_SIZE << s->order);
651 length = s->objects * s->size;
652 remainder = end - (start + length);
653 if (!remainder)
654 return 1;
656 fault = check_bytes(start + length, POISON_INUSE, remainder);
657 if (!fault)
658 return 1;
660 while (end > fault && end[-1] == POISON_INUSE)
661 end--;
663 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
664 print_section("Padding", start, length);
666 restore_bytes(s, "slab padding", POISON_INUSE, start, end);
667 return 0;
670 static int check_object(struct kmem_cache *s, struct slqb_page *page,
671 void *object, int active)
673 u8 *p = object;
674 u8 *endobject = object + s->objsize;
676 if (s->flags & SLAB_RED_ZONE) {
677 unsigned int red =
678 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
680 if (!check_bytes_and_report(s, page, object, "Redzone",
681 endobject, red, s->inuse - s->objsize))
682 return 0;
683 } else {
684 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
685 check_bytes_and_report(s, page, p, "Alignment padding",
686 endobject, POISON_INUSE, s->inuse - s->objsize);
690 if (s->flags & SLAB_POISON) {
691 if (!active && (s->flags & __OBJECT_POISON)) {
692 if (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1))
694 return 0;
696 if (!check_bytes_and_report(s, page, p, "Poison",
697 p + s->objsize - 1, POISON_END, 1))
698 return 0;
702 * check_pad_bytes cleans up on its own.
704 check_pad_bytes(s, page, p);
707 return 1;
710 static int check_slab(struct kmem_cache *s, struct slqb_page *page)
712 if (!(page->flags & PG_SLQB_BIT)) {
713 slab_err(s, page, "Not a valid slab page");
714 return 0;
716 if (page->inuse == 0) {
717 		slab_err(s, page, "inuse before free / after alloc");
718 return 0;
720 if (page->inuse > s->objects) {
721 		slab_err(s, page, "inuse %u > max %u",
722 			page->inuse, s->objects);
723 return 0;
725 /* Slab_pad_check fixes things up after itself */
726 slab_pad_check(s, page);
727 return 1;
730 static void trace(struct kmem_cache *s, struct slqb_page *page,
731 void *object, int alloc)
733 if (s->flags & SLAB_TRACE) {
734 printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
735 s->name,
736 alloc ? "alloc" : "free",
737 object, page->inuse,
738 page->freelist);
740 if (!alloc)
741 print_section("Object", (void *)object, s->objsize);
743 dump_stack();
747 static void setup_object_debug(struct kmem_cache *s, struct slqb_page *page,
748 void *object)
750 if (!slab_debug(s))
751 return;
753 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
754 return;
756 init_object(s, object, 0);
757 init_tracking(s, object);
760 static int alloc_debug_processing(struct kmem_cache *s,
761 void *object, unsigned long addr)
763 struct slqb_page *page;
764 page = virt_to_head_slqb_page(object);
766 if (!check_slab(s, page))
767 goto bad;
769 if (!check_valid_pointer(s, page, object)) {
770 object_err(s, page, object, "Freelist Pointer check fails");
771 goto bad;
774 if (object && !check_object(s, page, object, 0))
775 goto bad;
777 	/* Success: perform special debug activities for allocs */
778 if (s->flags & SLAB_STORE_USER)
779 set_track(s, object, TRACK_ALLOC, addr);
780 trace(s, page, object, 1);
781 init_object(s, object, 1);
782 return 1;
784 bad:
785 return 0;
788 static int free_debug_processing(struct kmem_cache *s,
789 void *object, unsigned long addr)
791 struct slqb_page *page;
792 page = virt_to_head_slqb_page(object);
794 if (!check_slab(s, page))
795 goto fail;
797 if (!check_valid_pointer(s, page, object)) {
798 slab_err(s, page, "Invalid object pointer 0x%p", object);
799 goto fail;
802 if (!check_object(s, page, object, 1))
803 return 0;
805 /* Special debug activities for freeing objects */
806 if (s->flags & SLAB_STORE_USER)
807 set_track(s, object, TRACK_FREE, addr);
808 trace(s, page, object, 0);
809 init_object(s, object, 0);
810 return 1;
812 fail:
813 slab_fix(s, "Object at 0x%p not freed", object);
814 return 0;
817 static int __init setup_slqb_debug(char *str)
819 slqb_debug = DEBUG_DEFAULT_FLAGS;
820 if (*str++ != '=' || !*str) {
822 * No options specified. Switch on full debugging.
824 goto out;
827 if (*str == ',') {
829 * No options but restriction on slabs. This means full
830 * debugging for slabs matching a pattern.
832 goto check_slabs;
835 slqb_debug = 0;
836 if (*str == '-') {
838 * Switch off all debugging measures.
840 goto out;
844 * Determine which debug features should be switched on
846 for (; *str && *str != ','; str++) {
847 switch (tolower(*str)) {
848 case 'f':
849 slqb_debug |= SLAB_DEBUG_FREE;
850 break;
851 case 'z':
852 slqb_debug |= SLAB_RED_ZONE;
853 break;
854 case 'p':
855 slqb_debug |= SLAB_POISON;
856 break;
857 case 'u':
858 slqb_debug |= SLAB_STORE_USER;
859 break;
860 case 't':
861 slqb_debug |= SLAB_TRACE;
862 break;
863 case 'a':
864 slqb_debug |= SLAB_FAILSLAB;
865 break;
866 default:
867 printk(KERN_ERR "slqb_debug option '%c' "
868 "unknown. skipped\n", *str);
872 check_slabs:
873 if (*str == ',')
874 slqb_debug_slabs = str + 1;
875 out:
876 return 1;
878 __setup("slqb_debug", setup_slqb_debug);
880 static int __init setup_slqb_min_order(char *str)
882 get_option(&str, &slqb_min_order);
883 slqb_min_order = min(slqb_min_order, MAX_ORDER - 1);
885 return 1;
887 __setup("slqb_min_order=", setup_slqb_min_order);
889 static int __init setup_slqb_min_objects(char *str)
891 get_option(&str, &slqb_min_objects);
893 return 1;
896 __setup("slqb_min_objects=", setup_slqb_min_objects);
898 static unsigned long kmem_cache_flags(unsigned long objsize,
899 unsigned long flags, const char *name,
900 void (*ctor)(void *))
903 * Enable debugging if selected on the kernel commandline.
905 if (slqb_debug && (!slqb_debug_slabs ||
906 strncmp(slqb_debug_slabs, name,
907 strlen(slqb_debug_slabs)) == 0))
908 flags |= slqb_debug;
910 if (num_possible_nodes() > 1)
911 flags |= SLAB_NUMA;
913 return flags;
915 #else
916 static inline void setup_object_debug(struct kmem_cache *s,
917 struct slqb_page *page, void *object)
921 static inline int alloc_debug_processing(struct kmem_cache *s,
922 void *object, unsigned long addr)
924 return 0;
927 static inline int free_debug_processing(struct kmem_cache *s,
928 void *object, unsigned long addr)
930 return 0;
933 static inline int slab_pad_check(struct kmem_cache *s, struct slqb_page *page)
935 return 1;
938 static inline int check_object(struct kmem_cache *s, struct slqb_page *page,
939 void *object, int active)
941 return 1;
944 static inline void add_full(struct kmem_cache_node *n, struct slqb_page *page)
948 static inline unsigned long kmem_cache_flags(unsigned long objsize,
949 unsigned long flags, const char *name, void (*ctor)(void *))
951 if (num_possible_nodes() > 1)
952 flags |= SLAB_NUMA;
953 return flags;
956 static const int slqb_debug;
957 #endif
960 * allocate a new slab (return its corresponding struct slqb_page)
962 static struct slqb_page *allocate_slab(struct kmem_cache *s,
963 gfp_t flags, int node)
965 struct slqb_page *page;
966 int pages = 1 << s->order;
968 flags |= s->allocflags;
970 page = (struct slqb_page *)alloc_pages_node(node, flags, s->order);
971 if (!page)
972 return NULL;
974 mod_zone_page_state(slqb_page_zone(page),
975 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
976 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
977 pages);
979 return page;
983 * Called once for each object on a new slab page
985 static void setup_object(struct kmem_cache *s,
986 struct slqb_page *page, void *object)
988 setup_object_debug(s, page, object);
989 if (unlikely(s->ctor))
990 s->ctor(object);
994 * Allocate a new slab, set up its object list.
996 static struct slqb_page *new_slab_page(struct kmem_cache *s,
997 gfp_t flags, int node, unsigned int colour)
999 struct slqb_page *page;
1000 void *start;
1001 void *last;
1002 void *p;
1004 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1006 page = allocate_slab(s,
1007 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1008 if (!page)
1009 goto out;
1011 page->flags |= PG_SLQB_BIT;
1013 start = page_address(&page->page);
1015 if (unlikely(slab_poison(s)))
1016 memset(start, POISON_INUSE, PAGE_SIZE << s->order);
1018 start += colour;
1020 last = start;
1021 for_each_object(p, s, start) {
1022 setup_object(s, page, p);
1023 set_freepointer(s, last, p);
1024 last = p;
1026 set_freepointer(s, last, NULL);
1028 page->freelist = start;
1029 page->inuse = 0;
1030 out:
1031 return page;
1035 * Free a slab page back to the page allocator
1037 static void __free_slab(struct kmem_cache *s, struct slqb_page *page)
1039 int pages = 1 << s->order;
1041 if (unlikely(slab_debug(s))) {
1042 void *p;
1044 slab_pad_check(s, page);
1045 for_each_free_object(p, s, page->freelist)
1046 check_object(s, page, p, 0);
1049 mod_zone_page_state(slqb_page_zone(page),
1050 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1051 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1052 -pages);
1054 __free_slqb_pages(page, s->order, pages);
1057 static void rcu_free_slab(struct rcu_head *h)
1059 struct slqb_page *page;
1061 page = container_of(h, struct slqb_page, rcu_head);
1062 __free_slab(page->list->cache, page);
1065 static void free_slab(struct kmem_cache *s, struct slqb_page *page)
1067 VM_BUG_ON(page->inuse);
1068 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU))
1069 call_rcu(&page->rcu_head, rcu_free_slab);
1070 else
1071 __free_slab(s, page);
1075 * Return an object to its slab.
1077 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1078 * list_lock in the case of per-node list.
1080 static int free_object_to_page(struct kmem_cache *s,
1081 struct kmem_cache_list *l, struct slqb_page *page,
1082 void *object)
1084 VM_BUG_ON(page->list != l);
1086 set_freepointer(s, object, page->freelist);
1087 page->freelist = object;
1088 page->inuse--;
1090 if (!page->inuse) {
1091 if (likely(s->objects > 1)) {
1092 l->nr_partial--;
1093 list_del(&page->lru);
1095 l->nr_slabs--;
1096 free_slab(s, page);
1097 slqb_stat_inc(l, FLUSH_SLAB_FREE);
1098 return 1;
1100 } else if (page->inuse + 1 == s->objects) {
1101 l->nr_partial++;
1102 list_add(&page->lru, &l->partial);
1103 slqb_stat_inc(l, FLUSH_SLAB_PARTIAL);
1104 return 0;
1106 return 0;
1109 #ifdef CONFIG_SMP
1110 static void slab_free_to_remote(struct kmem_cache *s, struct slqb_page *page,
1111 void *object, struct kmem_cache_cpu *c);
1112 #endif
1115 * Flush the LIFO list of objects on a list. They are sent back to their pages
1116 * in case the pages also belong to the list, or to our CPU's remote-free list
1117 * in the case they do not.
1119 * Doesn't flush the entire list. flush_free_list_all does.
1121 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1122 * list_lock in the case of per-node list.
1124 static void flush_free_list(struct kmem_cache *s, struct kmem_cache_list *l)
1126 void **head;
1127 int nr;
1128 int locked = 0;
1130 nr = l->freelist.nr;
1131 if (unlikely(!nr))
1132 return;
1134 nr = min(slab_freebatch(s), nr);
1136 slqb_stat_inc(l, FLUSH_FREE_LIST);
1137 slqb_stat_add(l, FLUSH_FREE_LIST_OBJECTS, nr);
1139 l->freelist.nr -= nr;
1140 head = l->freelist.head;
1142 do {
1143 struct slqb_page *page;
1144 void **object;
1146 object = head;
1147 VM_BUG_ON(!object);
1148 head = get_freepointer(s, object);
1149 page = virt_to_head_slqb_page(object);
1151 #ifdef CONFIG_SMP
1152 if (page->list != l) {
1153 struct kmem_cache_cpu *c;
1155 if (locked) {
1156 spin_unlock(&l->page_lock);
1157 locked = 0;
1160 c = get_cpu_slab(s, smp_processor_id());
1162 slab_free_to_remote(s, page, object, c);
1163 slqb_stat_inc(l, FLUSH_FREE_LIST_REMOTE);
1164 } else
1165 #endif
1167 if (!locked) {
1168 spin_lock(&l->page_lock);
1169 locked = 1;
1171 free_object_to_page(s, l, page, object);
1174 nr--;
1175 } while (nr);
1177 if (locked)
1178 spin_unlock(&l->page_lock);
1180 l->freelist.head = head;
1181 if (!l->freelist.nr)
1182 l->freelist.tail = NULL;
1185 static void flush_free_list_all(struct kmem_cache *s, struct kmem_cache_list *l)
1187 while (l->freelist.nr)
1188 flush_free_list(s, l);
1191 #ifdef CONFIG_SMP
1193 * If enough objects have been remotely freed back to this list,
1194  * remote_free_check will be set, in which case we'll eventually come here
1195 * to take those objects off our remote_free list and onto our LIFO freelist.
1197 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1198 * list_lock in the case of per-node list.
1200 static void claim_remote_free_list(struct kmem_cache *s,
1201 struct kmem_cache_list *l)
1203 void **head, **tail;
1204 int nr;
1206 if (!l->remote_free.list.nr)
1207 return;
1209 spin_lock(&l->remote_free.lock);
1211 l->remote_free_check = 0;
1212 head = l->remote_free.list.head;
1213 l->remote_free.list.head = NULL;
1214 tail = l->remote_free.list.tail;
1215 l->remote_free.list.tail = NULL;
1216 nr = l->remote_free.list.nr;
1217 l->remote_free.list.nr = 0;
1219 spin_unlock(&l->remote_free.lock);
1221 VM_BUG_ON(!nr);
1223 if (!l->freelist.nr) {
1224 /* Get head hot for likely subsequent allocation or flush */
1225 prefetchw(head);
1226 l->freelist.head = head;
1227 } else
1228 set_freepointer(s, l->freelist.tail, head);
1229 l->freelist.tail = tail;
1231 l->freelist.nr += nr;
1233 slqb_stat_inc(l, CLAIM_REMOTE_LIST);
1234 slqb_stat_add(l, CLAIM_REMOTE_LIST_OBJECTS, nr);
1236 #else
1237 static inline void claim_remote_free_list(struct kmem_cache *s,
1238 struct kmem_cache_list *l)
1241 #endif
1244 * Allocation fastpath. Get an object from the list's LIFO freelist, or
1245 * return NULL if it is empty.
1247 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1248 * list_lock in the case of per-node list.
1250 static __always_inline void *__cache_list_get_object(struct kmem_cache *s,
1251 struct kmem_cache_list *l)
1253 void *object;
1255 object = l->freelist.head;
1256 if (likely(object)) {
1257 void *next = get_freepointer(s, object);
1259 VM_BUG_ON(!l->freelist.nr);
1260 l->freelist.nr--;
1261 l->freelist.head = next;
1263 return object;
1265 VM_BUG_ON(l->freelist.nr);
1267 #ifdef CONFIG_SMP
1268 if (unlikely(l->remote_free_check)) {
1269 claim_remote_free_list(s, l);
1271 if (l->freelist.nr > slab_hiwater(s))
1272 flush_free_list(s, l);
1274 /* repetition here helps gcc :( */
1275 object = l->freelist.head;
1276 if (likely(object)) {
1277 void *next = get_freepointer(s, object);
1279 VM_BUG_ON(!l->freelist.nr);
1280 l->freelist.nr--;
1281 l->freelist.head = next;
1283 return object;
1285 VM_BUG_ON(l->freelist.nr);
1287 #endif
1289 return NULL;
1293 * Slow(er) path. Get a page from this list's existing pages. Will be a
1294 * new empty page in the case that __slab_alloc_page has just been called
1295 * (empty pages otherwise never get queued up on the lists), or a partial page
1296 * already on the list.
1298 * Caller must be the owner CPU in the case of per-CPU list, or hold the node's
1299 * list_lock in the case of per-node list.
1301 static noinline void *__cache_list_get_page(struct kmem_cache *s,
1302 struct kmem_cache_list *l)
1304 struct slqb_page *page;
1305 void *object;
1307 if (unlikely(!l->nr_partial))
1308 return NULL;
1310 page = list_first_entry(&l->partial, struct slqb_page, lru);
1311 VM_BUG_ON(page->inuse == s->objects);
1312 if (page->inuse + 1 == s->objects) {
1313 l->nr_partial--;
1314 list_del(&page->lru);
1317 VM_BUG_ON(!page->freelist);
1319 page->inuse++;
1321 object = page->freelist;
1322 page->freelist = get_freepointer(s, object);
1323 if (page->freelist)
1324 prefetchw(page->freelist);
1325 VM_BUG_ON((page->inuse == s->objects) != (page->freelist == NULL));
1326 slqb_stat_inc(l, ALLOC_SLAB_FILL);
1328 return object;
1331 static void *cache_list_get_page(struct kmem_cache *s,
1332 struct kmem_cache_list *l)
1334 void *object;
1336 if (unlikely(!l->nr_partial))
1337 return NULL;
1339 spin_lock(&l->page_lock);
1340 object = __cache_list_get_page(s, l);
1341 spin_unlock(&l->page_lock);
1343 return object;
1347 * Allocation slowpath. Allocate a new slab page from the page allocator, and
1348 * put it on the list's partial list. Must be followed by an allocation so
1349 * that we don't have dangling empty pages on the partial list.
1351 * Returns 0 on allocation failure.
1353 * Must be called with interrupts disabled.
1355 static noinline void *__slab_alloc_page(struct kmem_cache *s,
1356 gfp_t gfpflags, int node)
1358 struct slqb_page *page;
1359 struct kmem_cache_list *l;
1360 struct kmem_cache_cpu *c;
1361 unsigned int colour;
1362 void *object;
1364 c = get_cpu_slab(s, smp_processor_id());
1365 colour = c->colour_next;
1366 c->colour_next += s->colour_off;
1367 if (c->colour_next >= s->colour_range)
1368 c->colour_next = 0;
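	/*
	 * Slab colouring: colour_next cycles through [0, colour_range) in
	 * steps of colour_off (set up in kmem_cache_open() from the per-slab
	 * leftover space), so successive slab pages start their object area
	 * at staggered offsets and objects from different slabs are spread
	 * across CPU cache sets.
	 */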
1370 /* Caller handles __GFP_ZERO */
1371 gfpflags &= ~__GFP_ZERO;
1373 if (gfpflags & __GFP_WAIT)
1374 local_irq_enable();
1375 page = new_slab_page(s, gfpflags, node, colour);
1376 if (gfpflags & __GFP_WAIT)
1377 local_irq_disable();
1378 if (unlikely(!page))
1379 return page;
1381 if (!NUMA_BUILD || likely(slqb_page_to_nid(page) == numa_node_id())) {
1382 struct kmem_cache_cpu *c;
1383 int cpu = smp_processor_id();
1385 c = get_cpu_slab(s, cpu);
1386 l = &c->list;
1387 page->list = l;
1389 spin_lock(&l->page_lock);
1390 l->nr_slabs++;
1391 l->nr_partial++;
1392 list_add(&page->lru, &l->partial);
1393 slqb_stat_inc(l, ALLOC);
1394 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1395 object = __cache_list_get_page(s, l);
1396 spin_unlock(&l->page_lock);
1397 } else {
1398 #ifdef CONFIG_NUMA
1399 struct kmem_cache_node *n;
1401 n = s->node_slab[slqb_page_to_nid(page)];
1402 l = &n->list;
1403 page->list = l;
1405 spin_lock(&n->list_lock);
1406 spin_lock(&l->page_lock);
1407 l->nr_slabs++;
1408 l->nr_partial++;
1409 list_add(&page->lru, &l->partial);
1410 slqb_stat_inc(l, ALLOC);
1411 slqb_stat_inc(l, ALLOC_SLAB_NEW);
1412 object = __cache_list_get_page(s, l);
1413 spin_unlock(&l->page_lock);
1414 spin_unlock(&n->list_lock);
1415 #endif
1417 VM_BUG_ON(!object);
1418 return object;
1421 #ifdef CONFIG_NUMA
1422 static noinline int alternate_nid(struct kmem_cache *s,
1423 gfp_t gfpflags, int node)
1425 if (in_interrupt() || (gfpflags & __GFP_THISNODE))
1426 return node;
1427 if (cpuset_do_slab_mem_spread() && (s->flags & SLAB_MEM_SPREAD))
1428 return cpuset_mem_spread_node();
1429 else if (current->mempolicy)
1430 return slab_node(current->mempolicy);
1431 return node;
1435 * Allocate an object from a remote node. Return NULL if none could be found
1436 * (in which case, caller should allocate a new slab)
1438 * Must be called with interrupts disabled.
1440 static void *__remote_slab_alloc_node(struct kmem_cache *s,
1441 gfp_t gfpflags, int node)
1443 struct kmem_cache_node *n;
1444 struct kmem_cache_list *l;
1445 void *object;
1447 n = s->node_slab[node];
1448 if (unlikely(!n)) /* node has no memory */
1449 return NULL;
1450 l = &n->list;
1452 spin_lock(&n->list_lock);
1454 object = __cache_list_get_object(s, l);
1455 if (unlikely(!object)) {
1456 object = cache_list_get_page(s, l);
1457 if (unlikely(!object)) {
1458 spin_unlock(&n->list_lock);
1459 return __slab_alloc_page(s, gfpflags, node);
1462 if (likely(object))
1463 slqb_stat_inc(l, ALLOC);
1464 spin_unlock(&n->list_lock);
1465 return object;
1468 static noinline void *__remote_slab_alloc(struct kmem_cache *s,
1469 gfp_t gfpflags, int node)
1471 void *object;
1472 struct zonelist *zonelist;
1473 struct zoneref *z;
1474 struct zone *zone;
1475 enum zone_type high_zoneidx = gfp_zone(gfpflags);
1477 object = __remote_slab_alloc_node(s, gfpflags, node);
1478 if (likely(object || (gfpflags & __GFP_THISNODE)))
1479 return object;
1481 zonelist = node_zonelist(slab_node(current->mempolicy), gfpflags);
1482 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1483 if (!cpuset_zone_allowed_hardwall(zone, gfpflags))
1484 continue;
1486 node = zone_to_nid(zone);
1487 object = __remote_slab_alloc_node(s, gfpflags, node);
1488 if (likely(object))
1489 return object;
1491 return NULL;
1493 #endif
1496 * Main allocation path. Return an object, or NULL on allocation failure.
1498 * Must be called with interrupts disabled.
1500 static __always_inline void *__slab_alloc(struct kmem_cache *s,
1501 gfp_t gfpflags, int node)
1503 void *object;
1504 struct kmem_cache_cpu *c;
1505 struct kmem_cache_list *l;
1507 #ifdef CONFIG_NUMA
1508 if (unlikely(node != -1) && unlikely(node != numa_node_id())) {
1509 try_remote:
1510 return __remote_slab_alloc(s, gfpflags, node);
1512 #endif
1514 c = get_cpu_slab(s, smp_processor_id());
1515 VM_BUG_ON(!c);
1516 l = &c->list;
1517 object = __cache_list_get_object(s, l);
1518 if (unlikely(!object)) {
1519 #ifdef CONFIG_NUMA
1520 int thisnode = numa_node_id();
1523 * If the local node is memoryless, try remote alloc before
1524 * trying the page allocator. Otherwise, what happens is
1525 * objects are always freed to remote lists but the allocation
1526 * side always allocates a new page with only one object
1527 * used in each page
1529 if (unlikely(!node_state(thisnode, N_HIGH_MEMORY)))
1530 object = __remote_slab_alloc(s, gfpflags, thisnode);
1531 #endif
1533 if (!object) {
1534 object = cache_list_get_page(s, l);
1535 if (unlikely(!object)) {
1536 object = __slab_alloc_page(s, gfpflags, node);
1537 #ifdef CONFIG_NUMA
1538 if (unlikely(!object)) {
1539 node = numa_node_id();
1540 goto try_remote;
1542 #endif
1543 return object;
1547 if (likely(object))
1548 slqb_stat_inc(l, ALLOC);
1549 return object;
1553 * Perform some interrupts-on processing around the main allocation path
1554 * (debug checking and memset()ing).
1556 static __always_inline void *slab_alloc(struct kmem_cache *s,
1557 gfp_t gfpflags, int node, unsigned long addr)
1559 void *object;
1560 unsigned long flags;
1562 gfpflags &= gfp_allowed_mask;
1564 lockdep_trace_alloc(gfpflags);
1565 might_sleep_if(gfpflags & __GFP_WAIT);
1567 if (should_failslab(s->objsize, gfpflags, s->flags))
1568 return NULL;
1570 again:
1571 local_irq_save(flags);
1572 object = __slab_alloc(s, gfpflags, node);
1573 local_irq_restore(flags);
1575 if (unlikely(slab_debug(s)) && likely(object)) {
1576 if (unlikely(!alloc_debug_processing(s, object, addr)))
1577 goto again;
1580 if (unlikely(gfpflags & __GFP_ZERO) && likely(object))
1581 memset(object, 0, s->objsize);
1583 return object;
1586 static __always_inline void *__kmem_cache_alloc(struct kmem_cache *s,
1587 gfp_t gfpflags, unsigned long caller)
1589 int node = -1;
1591 #ifdef CONFIG_NUMA
1592 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
1593 node = alternate_nid(s, gfpflags, node);
1594 #endif
1595 return slab_alloc(s, gfpflags, node, caller);
1598 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1600 return __kmem_cache_alloc(s, gfpflags, _RET_IP_);
1602 EXPORT_SYMBOL(kmem_cache_alloc);
1604 #ifdef CONFIG_NUMA
1605 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1607 return slab_alloc(s, gfpflags, node, _RET_IP_);
1609 EXPORT_SYMBOL(kmem_cache_alloc_node);
1610 #endif
1612 #ifdef CONFIG_SMP
1614 * Flush this CPU's remote free list of objects back to the list from where
1615 * they originate. They end up on that list's remotely freed list, and
1616  * eventually we set its remote_free_check if there are enough objects on it.
1618  * This seems convoluted, but it keeps us from stomping on the target CPU's
1619 * fastpath cachelines.
1621 * Must be called with interrupts disabled.
1623 static void flush_remote_free_cache(struct kmem_cache *s,
1624 struct kmem_cache_cpu *c)
1626 struct kmlist *src;
1627 struct kmem_cache_list *dst;
1628 unsigned int nr;
1629 int set;
1631 src = &c->rlist;
1632 nr = src->nr;
1633 if (unlikely(!nr))
1634 return;
1636 #ifdef CONFIG_SLQB_STATS
1638 struct kmem_cache_list *l = &c->list;
1640 slqb_stat_inc(l, FLUSH_RFREE_LIST);
1641 slqb_stat_add(l, FLUSH_RFREE_LIST_OBJECTS, nr);
1643 #endif
1645 dst = c->remote_cache_list;
1648 * Less common case, dst is filling up so free synchronously.
1649 	 * No point in having the remote CPU free these as it will just
1650 * free them back to the page list anyway.
1652 if (unlikely(dst->remote_free.list.nr > (slab_hiwater(s) >> 1))) {
1653 void **head;
1655 head = src->head;
1656 spin_lock(&dst->page_lock);
1657 do {
1658 struct slqb_page *page;
1659 void **object;
1661 object = head;
1662 VM_BUG_ON(!object);
1663 head = get_freepointer(s, object);
1664 page = virt_to_head_slqb_page(object);
1666 free_object_to_page(s, dst, page, object);
1667 nr--;
1668 } while (nr);
1669 spin_unlock(&dst->page_lock);
1671 src->head = NULL;
1672 src->tail = NULL;
1673 src->nr = 0;
1675 return;
1678 spin_lock(&dst->remote_free.lock);
1680 if (!dst->remote_free.list.head)
1681 dst->remote_free.list.head = src->head;
1682 else
1683 set_freepointer(s, dst->remote_free.list.tail, src->head);
1684 dst->remote_free.list.tail = src->tail;
1686 src->head = NULL;
1687 src->tail = NULL;
1688 src->nr = 0;
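	/*
	 * Only tell the owner CPU to check its remotely-freed queue when this
	 * flush pushes the queue across the freebatch watermark: "set" records
	 * whether we were below the watermark before adding our objects, so
	 * remote_free_check (and the owner's cacheline) is dirtied at most
	 * once per crossing rather than on every flush.
	 */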
1690 if (dst->remote_free.list.nr < slab_freebatch(s))
1691 set = 1;
1692 else
1693 set = 0;
1695 dst->remote_free.list.nr += nr;
1697 if (unlikely(dst->remote_free.list.nr >= slab_freebatch(s) && set))
1698 dst->remote_free_check = 1;
1700 spin_unlock(&dst->remote_free.lock);
1704 * Free an object to this CPU's remote free list.
1706 * Must be called with interrupts disabled.
1708 static noinline void slab_free_to_remote(struct kmem_cache *s,
1709 struct slqb_page *page, void *object,
1710 struct kmem_cache_cpu *c)
1712 struct kmlist *r;
1715 * Our remote free list corresponds to a different list. Must
1716 * flush it and switch.
1718 if (page->list != c->remote_cache_list) {
1719 flush_remote_free_cache(s, c);
1720 c->remote_cache_list = page->list;
1723 r = &c->rlist;
1724 if (!r->head)
1725 r->head = object;
1726 else
1727 set_freepointer(s, r->tail, object);
1728 set_freepointer(s, object, NULL);
1729 r->tail = object;
1730 r->nr++;
1732 if (unlikely(r->nr >= slab_freebatch(s)))
1733 flush_remote_free_cache(s, c);
1735 #endif
1738  * Main freeing path. Frees the object back to its cache's lists.
1740 * Must be called with interrupts disabled.
1742 static __always_inline void __slab_free(struct kmem_cache *s,
1743 struct slqb_page *page, void *object)
1745 struct kmem_cache_cpu *c;
1746 struct kmem_cache_list *l;
1747 int thiscpu = smp_processor_id();
1749 c = get_cpu_slab(s, thiscpu);
1750 l = &c->list;
1752 slqb_stat_inc(l, FREE);
1754 if (!NUMA_BUILD || !slab_numa(s) ||
1755 likely(slqb_page_to_nid(page) == numa_node_id())) {
1757 * Freeing fastpath. Collects all local-node objects, not
1758 * just those allocated from our per-CPU list. This allows
1759 * fast transfer of objects from one CPU to another within
1760 * a given node.
1762 set_freepointer(s, object, l->freelist.head);
1763 l->freelist.head = object;
1764 if (!l->freelist.nr)
1765 l->freelist.tail = object;
1766 l->freelist.nr++;
1768 if (unlikely(l->freelist.nr > slab_hiwater(s)))
1769 flush_free_list(s, l);
1771 } else {
1772 #ifdef CONFIG_SMP
1774 * Freeing an object that was allocated on a remote node.
1776 slab_free_to_remote(s, page, object, c);
1777 slqb_stat_inc(l, FREE_REMOTE);
1778 #endif
1783 * Perform some interrupts-on processing around the main freeing path
1784 * (debug checking).
1786 static __always_inline void slab_free(struct kmem_cache *s,
1787 struct slqb_page *page, void *object)
1789 unsigned long flags;
1791 prefetchw(object);
1793 debug_check_no_locks_freed(object, s->objsize);
1794 if (likely(object) && unlikely(slab_debug(s))) {
1795 if (unlikely(!free_debug_processing(s, object, _RET_IP_)))
1796 return;
1799 local_irq_save(flags);
1800 __slab_free(s, page, object);
1801 local_irq_restore(flags);
1804 void kmem_cache_free(struct kmem_cache *s, void *object)
1806 struct slqb_page *page = NULL;
1808 if (slab_numa(s))
1809 page = virt_to_head_slqb_page(object);
1810 slab_free(s, page, object);
1812 EXPORT_SYMBOL(kmem_cache_free);
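/*
 * Typical usage from client code, via the standard slab cache API ("foo" is
 * a placeholder cache name and structure, shown only as an illustration):
 *
 *	struct kmem_cache *foo_cache;
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, NULL);
 *	obj = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, obj);
 */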
1815  * Calculate the order of allocation given a slab object size.
1817  * Order 0 allocations are preferred since order 0 does not cause fragmentation
1818  * in the page allocator, and they have fastpaths in the page allocator. But we
1819  * also try to minimise external fragmentation with large objects.
1821 static int slab_order(int size, int max_order, int frac)
1823 int order;
1825 if (fls(size - 1) <= PAGE_SHIFT)
1826 order = 0;
1827 else
1828 order = fls(size - 1) - PAGE_SHIFT;
1829 if (order < slqb_min_order)
1830 order = slqb_min_order;
1832 while (order <= max_order) {
1833 unsigned long slab_size = PAGE_SIZE << order;
1834 unsigned long objects;
1835 unsigned long waste;
1837 objects = slab_size / size;
1838 if (!objects)
1839 goto next;
1841 if (order < MAX_ORDER && objects < slqb_min_objects) {
1843 * if we don't have enough objects for min_objects,
1844 * then try the next size up. Unless we have reached
1845 * our maximum possible page size.
1847 goto next;
1850 waste = slab_size - (objects * size);
1852 if (waste * frac <= slab_size)
1853 break;
1855 next:
1856 order++;
1859 return order;
1862 static int calculate_order(int size)
1864 int order;
1867 * Attempt to find best configuration for a slab. This
1868 * works by first attempting to generate a layout with
1869 * the best configuration and backing off gradually.
1871 order = slab_order(size, 1, 4);
1872 if (order <= 1)
1873 return order;
1876 * This size cannot fit in order-1. Allow bigger orders, but
1877 * forget about trying to save space.
1879 order = slab_order(size, MAX_ORDER - 1, 0);
1880 if (order < MAX_ORDER)
1881 return order;
1883 return -ENOSYS;
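/*
 * Worked example (assuming 4KiB pages and the default slqb_min_order /
 * slqb_min_objects settings): for a 700-byte object, slab_order(700, 1, 4)
 * finds that an order-0 slab holds 5 objects and wastes 596 bytes, and
 * 596 * 4 <= 4096, so order 0 is chosen.  Objects that would waste more than
 * a quarter of an order-1 slab fall through to the second slab_order() call,
 * which ignores wastage and picks the smallest order that holds at least
 * slqb_min_objects objects.
 */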
1887 * Figure out what the alignment of the objects will be.
1889 static unsigned long calculate_alignment(unsigned long flags,
1890 unsigned long align, unsigned long size)
1893 * If the user wants hardware cache aligned objects then follow that
1894 * suggestion if the object is sufficiently large.
1896 * The hardware cache alignment cannot override the specified
1897  * alignment though. If that is greater, then use it.
1899 if (flags & SLAB_HWCACHE_ALIGN) {
1900 unsigned long ralign = cache_line_size();
1902 while (size <= ralign / 2)
1903 ralign /= 2;
1904 align = max(align, ralign);
1907 if (align < ARCH_SLAB_MINALIGN)
1908 align = ARCH_SLAB_MINALIGN;
1910 return ALIGN(align, sizeof(void *));
1913 static void init_kmem_cache_list(struct kmem_cache *s,
1914 struct kmem_cache_list *l)
1916 l->cache = s;
1917 l->freelist.nr = 0;
1918 l->freelist.head = NULL;
1919 l->freelist.tail = NULL;
1920 l->nr_partial = 0;
1921 l->nr_slabs = 0;
1922 INIT_LIST_HEAD(&l->partial);
1923 spin_lock_init(&l->page_lock);
1925 #ifdef CONFIG_SMP
1926 l->remote_free_check = 0;
1927 spin_lock_init(&l->remote_free.lock);
1928 l->remote_free.list.nr = 0;
1929 l->remote_free.list.head = NULL;
1930 l->remote_free.list.tail = NULL;
1931 #endif
1933 #ifdef CONFIG_SLQB_STATS
1934 memset(l->stats, 0, sizeof(l->stats));
1935 #endif
1938 static void init_kmem_cache_cpu(struct kmem_cache *s,
1939 struct kmem_cache_cpu *c)
1941 init_kmem_cache_list(s, &c->list);
1943 c->colour_next = 0;
1944 #ifdef CONFIG_SMP
1945 c->rlist.nr = 0;
1946 c->rlist.head = NULL;
1947 c->rlist.tail = NULL;
1948 c->remote_cache_list = NULL;
1949 #endif
1952 #ifdef CONFIG_NUMA
1953 static void init_kmem_cache_node(struct kmem_cache *s,
1954 struct kmem_cache_node *n)
1956 spin_lock_init(&n->list_lock);
1957 init_kmem_cache_list(s, &n->list);
1959 #endif
1961 /* Initial slabs. */
1962 #ifdef CONFIG_SMP
1963 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cache_cpus);
1964 #endif
1965 #ifdef CONFIG_NUMA
1966 /* XXX: really need a DEFINE_PER_NODE for per-node data because a static
1967 * array is wasteful */
1968 static struct kmem_cache_node kmem_cache_nodes[MAX_NUMNODES];
1969 #endif
1971 #ifdef CONFIG_SMP
1972 static struct kmem_cache kmem_cpu_cache;
1973 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_cpu_cpus);
1974 #ifdef CONFIG_NUMA
1975 static struct kmem_cache_node kmem_cpu_nodes[MAX_NUMNODES]; /* XXX per-nid */
1976 #endif
1977 #endif
1979 #ifdef CONFIG_NUMA
1980 static struct kmem_cache kmem_node_cache;
1981 #ifdef CONFIG_SMP
1982 static DEFINE_PER_CPU(struct kmem_cache_cpu, kmem_node_cpus);
1983 #endif
1984 static struct kmem_cache_node kmem_node_nodes[MAX_NUMNODES]; /*XXX per-nid */
1985 #endif
1987 #ifdef CONFIG_SMP
1988 static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1989 int cpu)
1991 struct kmem_cache_cpu *c;
1992 int node;
1994 node = cpu_to_node(cpu);
1996 c = kmem_cache_alloc_node(&kmem_cpu_cache, GFP_KERNEL, node);
1997 if (!c)
1998 return NULL;
2000 init_kmem_cache_cpu(s, c);
2001 return c;
2004 static void free_kmem_cache_cpus(struct kmem_cache *s)
2006 int cpu;
2008 for_each_online_cpu(cpu) {
2009 struct kmem_cache_cpu *c;
2011 c = s->cpu_slab[cpu];
2012 if (c) {
2013 kmem_cache_free(&kmem_cpu_cache, c);
2014 s->cpu_slab[cpu] = NULL;
2019 static int alloc_kmem_cache_cpus(struct kmem_cache *s)
2021 int cpu;
2023 for_each_online_cpu(cpu) {
2024 struct kmem_cache_cpu *c;
2026 c = s->cpu_slab[cpu];
2027 if (c)
2028 continue;
2030 c = alloc_kmem_cache_cpu(s, cpu);
2031 if (!c) {
2032 free_kmem_cache_cpus(s);
2033 return 0;
2035 s->cpu_slab[cpu] = c;
2037 return 1;
2040 #else
2041 static inline void free_kmem_cache_cpus(struct kmem_cache *s)
2045 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2047 init_kmem_cache_cpu(s, &s->cpu_slab);
2048 return 1;
2050 #endif
2052 #ifdef CONFIG_NUMA
2053 static void free_kmem_cache_nodes(struct kmem_cache *s)
2055 int node;
2057 for_each_node_state(node, N_NORMAL_MEMORY) {
2058 struct kmem_cache_node *n;
2060 n = s->node_slab[node];
2061 if (n) {
2062 kmem_cache_free(&kmem_node_cache, n);
2063 s->node_slab[node] = NULL;
2068 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2070 int node;
2072 for_each_node_state(node, N_NORMAL_MEMORY) {
2073 struct kmem_cache_node *n;
2075 n = kmem_cache_alloc_node(&kmem_node_cache, GFP_KERNEL, node);
2076 if (!n) {
2077 free_kmem_cache_nodes(s);
2078 return 0;
2080 init_kmem_cache_node(s, n);
2081 s->node_slab[node] = n;
2083 return 1;
2085 #else
2086 static void free_kmem_cache_nodes(struct kmem_cache *s)
2090 static int alloc_kmem_cache_nodes(struct kmem_cache *s)
2092 return 1;
2094 #endif
2097 * calculate_sizes() determines the order and the distribution of data within
2098 * a slab object.
2100 static int calculate_sizes(struct kmem_cache *s)
2102 unsigned long flags = s->flags;
2103 unsigned long size = s->objsize;
2104 unsigned long align = s->align;
2107 * Determine if we can poison the object itself. If the user of
2108 * the slab may touch the object after free or before allocation
2109 * then we should never poison the object itself.
2111 if (slab_poison(s) && !(flags & SLAB_DESTROY_BY_RCU) && !s->ctor)
2112 s->flags |= __OBJECT_POISON;
2113 else
2114 s->flags &= ~__OBJECT_POISON;
2117 * Round up object size to the next word boundary. We can only
2118 * place the free pointer at word boundaries and this determines
2119 * the possible location of the free pointer.
2121 size = ALIGN(size, sizeof(void *));
2123 #ifdef CONFIG_SLQB_DEBUG
2125 * If we are Redzoning then check if there is some space between the
2126 * end of the object and the free pointer. If not then add an
2127 * additional word to have some bytes to store Redzone information.
2129 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2130 size += sizeof(void *);
2131 #endif
2134 * With that we have determined the number of bytes in actual use
2135 * by the object. This is the potential offset to the free pointer.
2137 s->inuse = size;
2139 if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || s->ctor)) {
2141 * Relocate free pointer after the object if it is not
2142 * permitted to overwrite the first word of the object on
2143 * kmem_cache_free.
2145 * This is the case if we do RCU, have a constructor or
2146 * destructor or are poisoning the objects.
2148 s->offset = size;
2149 size += sizeof(void *);
2152 #ifdef CONFIG_SLQB_DEBUG
2153 if (flags & SLAB_STORE_USER) {
2155 * Need to store information about allocs and frees after
2156 * the object.
2158 size += 2 * sizeof(struct track);
2161 if (flags & SLAB_RED_ZONE) {
2163 * Add some empty padding so that we can catch
2164 * overwrites from earlier objects rather than let
2165 * tracking information or the free pointer be
2166 		 * corrupted if a user writes before the start
2167 * of the object.
2169 size += sizeof(void *);
2171 #endif
2174 * Determine the alignment based on various parameters that the
2175 * user specified and the dynamic determination of cache line size
2176 * on bootup.
2178 align = calculate_alignment(flags, align, s->objsize);
2181 * SLQB stores one object immediately after another beginning from
2182 * offset 0. In order to align the objects we have to simply size
2183 * each object to conform to the alignment.
2185 size = ALIGN(size, align);
2186 s->size = size;
2187 s->order = calculate_order(size);
2189 if (s->order < 0)
2190 return 0;
2192 s->allocflags = 0;
2193 if (s->order)
2194 s->allocflags |= __GFP_COMP;
2196 if (s->flags & SLAB_CACHE_DMA)
2197 s->allocflags |= SLQB_DMA;
2199 if (s->flags & SLAB_RECLAIM_ACCOUNT)
2200 s->allocflags |= __GFP_RECLAIMABLE;
2203 * Determine the number of objects per slab
2205 s->objects = (PAGE_SIZE << s->order) / size;
2207 s->freebatch = max(4UL*PAGE_SIZE / size,
2208 min(256UL, 64*PAGE_SIZE / size));
2209 if (!s->freebatch)
2210 s->freebatch = 1;
2211 s->hiwater = s->freebatch << 2;
2213 return !!s->objects;
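/*
 * Worked example (assuming 4KiB pages, no debugging, no constructor, a zero
 * align argument and no SLAB_HWCACHE_ALIGN): a 192-byte object keeps
 * size == 192, an order-0 slab then holds 21 objects,
 * freebatch = max(4 * 4096 / 192, min(256, 64 * 4096 / 192)) = 256, and
 * hiwater = freebatch << 2 = 1024.
 */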
2217 #ifdef CONFIG_SMP
2219  * The per-CPU allocator can't be used because it always uses the slab allocator,
2220 * and it can't do per-node allocations.
2222 static void *kmem_cache_dyn_array_alloc(int ids)
2224 size_t size = sizeof(void *) * ids;
2226 BUG_ON(!size);
2228 if (unlikely(!slab_is_available())) {
2229 static void *nextmem;
2230 static size_t nextleft;
2231 void *ret;
2234 * Special case for setting up initial caches. These will
2235 * never get freed by definition so we can do it rather
2236 * simply.
2238 if (size > nextleft) {
2239 nextmem = alloc_pages_exact(size, GFP_KERNEL);
2240 if (!nextmem)
2241 return NULL;
2242 nextleft = roundup(size, PAGE_SIZE);
2245 ret = nextmem;
2246 nextleft -= size;
2247 nextmem += size;
2248 memset(ret, 0, size);
2249 return ret;
2250 } else {
2251 return kzalloc(size, GFP_KERNEL);
2255 static void kmem_cache_dyn_array_free(void *array)
2257 if (unlikely(!slab_is_available()))
2258 return; /* error case without crashing here (will panic soon) */
2259 kfree(array);
2261 #endif
2264 * Except in early boot, this should be called with slqb_lock held for write
2265 * to lock out hotplug, and protect list modifications.
2267 static int kmem_cache_open(struct kmem_cache *s,
2268 const char *name, size_t size, size_t align,
2269 unsigned long flags, void (*ctor)(void *), int alloc)
2271 unsigned int left_over;
2273 memset(s, 0, sizeof(struct kmem_cache));
2274 s->name = name;
2275 s->ctor = ctor;
2276 s->objsize = size;
2277 s->align = align;
2278 s->flags = kmem_cache_flags(size, flags, name, ctor);
2280 if (!calculate_sizes(s))
2281 goto error;
2283 if (!slab_debug(s)) {
2284 left_over = (PAGE_SIZE << s->order) - (s->objects * s->size);
2285 s->colour_off = max(cache_line_size(), s->align);
2286 s->colour_range = left_over;
2287 } else {
2288 s->colour_off = 0;
2289 s->colour_range = 0;
2292 #ifdef CONFIG_SMP
2293 s->cpu_slab = kmem_cache_dyn_array_alloc(nr_cpu_ids);
2294 if (!s->cpu_slab)
2295 goto error;
2296 # ifdef CONFIG_NUMA
2297 s->node_slab = kmem_cache_dyn_array_alloc(nr_node_ids);
2298 if (!s->node_slab)
2299 goto error_cpu_array;
2300 # endif
2301 #endif
2303 if (likely(alloc)) {
2304 if (!alloc_kmem_cache_nodes(s))
2305 goto error_node_array;
2307 if (!alloc_kmem_cache_cpus(s))
2308 goto error_nodes;
2311 sysfs_slab_add(s);
2312 list_add(&s->list, &slab_caches);
2314 return 1;
2316 error_nodes:
2317 free_kmem_cache_nodes(s);
2318 error_node_array:
2319 #if defined(CONFIG_NUMA) && defined(CONFIG_SMP)
2320 kmem_cache_dyn_array_free(s->node_slab);
2321 error_cpu_array:
2322 #endif
2323 #ifdef CONFIG_SMP
2324 kmem_cache_dyn_array_free(s->cpu_slab);
2325 #endif
2326 error:
2327 if (flags & SLAB_PANIC)
2328 panic("%s: failed to create slab `%s'\n", __func__, name);
2329 return 0;
2333 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
2334 * @s: the cache we're checking against
2335 * @ptr: pointer to validate
2337 * This verifies that the untrusted pointer looks sane;
2338 * it is _not_ a guarantee that the pointer is actually
2339 * part of the slab cache in question, but it at least
2340 * validates that the pointer can be dereferenced and
2341 * looks half-way sane.
2343 * Currently only used for dentry validation.
2345 int kmem_ptr_validate(struct kmem_cache *s, const void *ptr)
2347 unsigned long addr = (unsigned long)ptr;
2348 struct slqb_page *page;
2350 if (unlikely(addr < PAGE_OFFSET))
2351 goto out;
2352 if (unlikely(addr > (unsigned long)high_memory - s->size))
2353 goto out;
2354 if (unlikely(!IS_ALIGNED(addr, s->align)))
2355 goto out;
2356 if (unlikely(!kern_addr_valid(addr)))
2357 goto out;
2358 if (unlikely(!kern_addr_valid(addr + s->size - 1)))
2359 goto out;
2360 if (unlikely(!pfn_valid(addr >> PAGE_SHIFT)))
2361 goto out;
2362 page = virt_to_head_slqb_page(ptr);
2363 if (unlikely(!(page->flags & PG_SLQB_BIT)))
2364 goto out;
2365 if (unlikely(page->list->cache != s)) /* XXX: ouch, racy */
2366 goto out;
2367 return 1;
2368 out:
2369 return 0;
2371 EXPORT_SYMBOL(kmem_ptr_validate);
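/*
 * Usage sketch (hypothetical cache and variable names, not taken from this
 * file): a subsystem holding a possibly-stale pointer can reject obvious
 * garbage cheaply before dereferencing it. This is only a sanity filter,
 * not proof that the object is live or owned by the caller.
 */
#if 0	/* example only */
	if (!kmem_ptr_validate(foo_cache, candidate))
		return -EINVAL;	/* failed a basic sanity check */
	/* candidate at least points into foo_cache-managed slab memory */
#endif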
2374 * Determine the size of a slab object
2376 unsigned int kmem_cache_size(struct kmem_cache *s)
2378 return s->objsize;
2380 EXPORT_SYMBOL(kmem_cache_size);
2382 const char *kmem_cache_name(struct kmem_cache *s)
2384 return s->name;
2386 EXPORT_SYMBOL(kmem_cache_name);
2389 * Release all resources used by a slab cache. No more concurrency on the
2390 * slab, so we can touch remote kmem_cache_cpu structures.
2392 void kmem_cache_destroy(struct kmem_cache *s)
2394 #ifdef CONFIG_NUMA
2395 int node;
2396 #endif
2397 int cpu;
2399 down_write(&slqb_lock);
2400 list_del(&s->list);
2402 local_irq_disable();
2403 #ifdef CONFIG_SMP
2404 for_each_online_cpu(cpu) {
2405 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2406 struct kmem_cache_list *l = &c->list;
2408 flush_free_list_all(s, l);
2409 flush_remote_free_cache(s, c);
2411 #endif
2413 for_each_online_cpu(cpu) {
2414 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2415 struct kmem_cache_list *l = &c->list;
2417 claim_remote_free_list(s, l);
2418 flush_free_list_all(s, l);
2420 WARN_ON(l->freelist.nr);
2421 WARN_ON(l->nr_slabs);
2422 WARN_ON(l->nr_partial);
2425 free_kmem_cache_cpus(s);
2427 #ifdef CONFIG_NUMA
2428 for_each_node_state(node, N_NORMAL_MEMORY) {
2429 struct kmem_cache_node *n;
2430 struct kmem_cache_list *l;
2432 n = s->node_slab[node];
2433 if (!n)
2434 continue;
2435 l = &n->list;
2437 claim_remote_free_list(s, l);
2438 flush_free_list_all(s, l);
2440 WARN_ON(l->freelist.nr);
2441 WARN_ON(l->nr_slabs);
2442 WARN_ON(l->nr_partial);
2445 free_kmem_cache_nodes(s);
2446 #endif
2447 local_irq_enable();
2449 sysfs_slab_remove(s);
2450 up_write(&slqb_lock);
2452 EXPORT_SYMBOL(kmem_cache_destroy);
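/*
 * Cache lifecycle as seen by a client (a minimal sketch; struct foo,
 * foo_cache and the error handling policy are illustrative, not part of
 * SLQB). The destroy path above assumes every object has already been
 * freed back to the cache.
 */
#if 0	/* example only */
struct foo {
	int value;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, NULL);
	return foo_cache ? 0 : -ENOMEM;
}

static void foo_use(void)
{
	struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

	if (f) {
		f->value = 1;
		kmem_cache_free(foo_cache, f);
	}
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);	/* all objects must be freed by now */
}
#endif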
2454 /********************************************************************
2455 * Kmalloc subsystem
2456 *******************************************************************/
2458 struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2459 EXPORT_SYMBOL(kmalloc_caches);
2461 #ifdef CONFIG_ZONE_DMA
2462 struct kmem_cache kmalloc_caches_dma[KMALLOC_SHIFT_SLQB_HIGH + 1] __cacheline_aligned;
2463 EXPORT_SYMBOL(kmalloc_caches_dma);
2464 #endif
2466 #ifndef ARCH_KMALLOC_FLAGS
2467 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
2468 #endif
2470 static struct kmem_cache *open_kmalloc_cache(struct kmem_cache *s,
2471 const char *name, int size, gfp_t gfp_flags)
2473 unsigned int flags = ARCH_KMALLOC_FLAGS | SLAB_PANIC;
2475 if (gfp_flags & SLQB_DMA)
2476 flags |= SLAB_CACHE_DMA;
2478 kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL, 1);
2480 return s;
2484  * Conversion table for small slab sizes / 8 to the index in the
2485  * kmalloc array. This is necessary for slabs < 192 since we have
2486  * non-power-of-two cache sizes there. The size of larger slabs can be
2487  * determined using fls.
2489 static s8 size_index[24] __cacheline_aligned = {
2490 3, /* 8 */
2491 4, /* 16 */
2492 5, /* 24 */
2493 5, /* 32 */
2494 6, /* 40 */
2495 6, /* 48 */
2496 6, /* 56 */
2497 6, /* 64 */
2498 #if L1_CACHE_BYTES < 64
2499 1, /* 72 */
2500 1, /* 80 */
2501 1, /* 88 */
2502 1, /* 96 */
2503 #else
2508 #endif
2509 7, /* 104 */
2510 7, /* 112 */
2511 7, /* 120 */
2512 7, /* 128 */
2513 #if L1_CACHE_BYTES < 128
2514 2, /* 136 */
2515 2, /* 144 */
2516 2, /* 152 */
2517 2, /* 160 */
2518 2, /* 168 */
2519 2, /* 176 */
2520 2, /* 184 */
2521 2 /* 192 */
2522 #else
2531 #endif
2534 static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2536 int index;
2538 if (unlikely(size <= KMALLOC_MIN_SIZE)) {
2539 if (unlikely(!size))
2540 return ZERO_SIZE_PTR;
2542 index = KMALLOC_SHIFT_LOW;
2543 goto got_index;
2546 #if L1_CACHE_BYTES >= 128
2547 if (size <= 128) {
2548 #else
2549 if (size <= 192) {
2550 #endif
2551 index = size_index[(size - 1) / 8];
2552 } else {
2553 if (unlikely(size > 1UL << KMALLOC_SHIFT_SLQB_HIGH))
2554 return NULL;
2556 index = fls(size - 1);
2559 got_index:
2560 if (unlikely((flags & SLQB_DMA)))
2561 return &kmalloc_caches_dma[index];
2562 else
2563 return &kmalloc_caches[index];
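/*
 * Worked example (assumes the usual KMALLOC_MIN_SIZE of 8 and
 * L1_CACHE_BYTES == 64):
 *
 *   kmalloc(100, ...):  100 <= 192, so index = size_index[(100 - 1) / 8]
 *                       = size_index[12] = 7  ->  kmalloc_caches[7] (128 B)
 *   kmalloc(1000, ...): 1000 > 192, so index = fls(999) = 10
 *                       ->  kmalloc_caches[10] (1024 B)
 *
 * A request with SLQB_DMA set takes the same index into
 * kmalloc_caches_dma[] instead.
 */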
2566 void *__kmalloc(size_t size, gfp_t flags)
2568 struct kmem_cache *s;
2570 s = get_slab(size, flags);
2571 if (unlikely(ZERO_OR_NULL_PTR(s)))
2572 return s;
2574 return __kmem_cache_alloc(s, flags, _RET_IP_);
2576 EXPORT_SYMBOL(__kmalloc);
2578 #ifdef CONFIG_NUMA
2579 void *__kmalloc_node(size_t size, gfp_t flags, int node)
2581 struct kmem_cache *s;
2583 s = get_slab(size, flags);
2584 if (unlikely(ZERO_OR_NULL_PTR(s)))
2585 return s;
2587 return kmem_cache_alloc_node(s, flags, node);
2589 EXPORT_SYMBOL(__kmalloc_node);
2590 #endif
2592 size_t ksize(const void *object)
2594 struct slqb_page *page;
2595 struct kmem_cache *s;
2597 BUG_ON(!object);
2598 if (unlikely(object == ZERO_SIZE_PTR))
2599 return 0;
2601 page = virt_to_head_slqb_page(object);
2602 BUG_ON(!(page->flags & PG_SLQB_BIT));
2604 s = page->list->cache;
2607 * Debugging requires use of the padding between object
2608 * and whatever may come after it.
2610 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2611 return s->objsize;
2614  * If we need to store the freelist pointer back there,
2615  * or track user information, then we can only use the
2616  * space before that information.
2618 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2619 return s->inuse;
2622  * Otherwise we can use all the padding etc. for the allocation
2624 return s->size;
2626 EXPORT_SYMBOL(ksize);
2628 void kfree(const void *object)
2630 struct kmem_cache *s;
2631 struct slqb_page *page;
2633 if (unlikely(ZERO_OR_NULL_PTR(object)))
2634 return;
2636 page = virt_to_head_slqb_page(object);
2637 s = page->list->cache;
2639 slab_free(s, page, (void *)object);
2641 EXPORT_SYMBOL(kfree);
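/*
 * Round-trip sketch (illustrative; the exact ksize() result depends on the
 * cache geometry and debug flags discussed above): __kmalloc() picks a
 * cache via get_slab(), ksize() reports that cache's usable object size,
 * and kfree() hands the object back to the owning cache via slab_free().
 */
#if 0	/* example only */
	void *p = kmalloc(100, GFP_KERNEL);	/* served from kmalloc-128 */

	if (p) {
		size_t usable = ksize(p);	/* typically 128 here */

		memset(p, 0, usable);		/* the whole object is usable */
		kfree(p);
	}
#endif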
2643 static void kmem_cache_trim_percpu(void *arg)
2645 int cpu = smp_processor_id();
2646 struct kmem_cache *s = arg;
2647 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2648 struct kmem_cache_list *l = &c->list;
2650 claim_remote_free_list(s, l);
2651 flush_free_list(s, l);
2652 #ifdef CONFIG_SMP
2653 flush_remote_free_cache(s, c);
2654 #endif
2657 int kmem_cache_shrink(struct kmem_cache *s)
2659 #ifdef CONFIG_NUMA
2660 int node;
2661 #endif
2663 on_each_cpu(kmem_cache_trim_percpu, s, 1);
2665 #ifdef CONFIG_NUMA
2666 for_each_node_state(node, N_NORMAL_MEMORY) {
2667 struct kmem_cache_node *n;
2668 struct kmem_cache_list *l;
2670 n = s->node_slab[node];
2671 if (!n)
2672 continue;
2673 l = &n->list;
2675 spin_lock_irq(&n->list_lock);
2676 claim_remote_free_list(s, l);
2677 flush_free_list(s, l);
2678 spin_unlock_irq(&n->list_lock);
2680 #endif
2682 return 0;
2684 EXPORT_SYMBOL(kmem_cache_shrink);
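/*
 * Usage note (illustrative): a cache owner may call kmem_cache_shrink()
 * after releasing a large batch of objects to drain the queues and return
 * empty pages sooner than the periodic cache_trim_worker() below would,
 * e.g. kmem_cache_shrink(foo_cache) with the cache from the earlier sketch.
 */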
2686 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2687 static void kmem_cache_reap_percpu(void *arg)
2689 int cpu = smp_processor_id();
2690 struct kmem_cache *s;
2691 long phase = (long)arg;
2693 list_for_each_entry(s, &slab_caches, list) {
2694 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2695 struct kmem_cache_list *l = &c->list;
2697 if (phase == 0) {
2698 flush_free_list_all(s, l);
2699 flush_remote_free_cache(s, c);
2702 if (phase == 1) {
2703 claim_remote_free_list(s, l);
2704 flush_free_list_all(s, l);
2709 static void kmem_cache_reap(void)
2711 struct kmem_cache *s;
2712 int node;
2714 down_read(&slqb_lock);
2715 on_each_cpu(kmem_cache_reap_percpu, (void *)0, 1);
2716 on_each_cpu(kmem_cache_reap_percpu, (void *)1, 1);
2718 list_for_each_entry(s, &slab_caches, list) {
2719 for_each_node_state(node, N_NORMAL_MEMORY) {
2720 struct kmem_cache_node *n;
2721 struct kmem_cache_list *l;
2723 n = s->node_slab[node];
2724 if (!n)
2725 continue;
2726 l = &n->list;
2728 spin_lock_irq(&n->list_lock);
2729 claim_remote_free_list(s, l);
2730 flush_free_list_all(s, l);
2731 spin_unlock_irq(&n->list_lock);
2734 up_read(&slqb_lock);
2736 #endif
2738 static void cache_trim_worker(struct work_struct *w)
2740 struct delayed_work *work =
2741 container_of(w, struct delayed_work, work);
2742 struct kmem_cache *s;
2744 if (!down_read_trylock(&slqb_lock))
2745 goto out;
2747 list_for_each_entry(s, &slab_caches, list) {
2748 #ifdef CONFIG_NUMA
2749 int node = numa_node_id();
2750 struct kmem_cache_node *n = s->node_slab[node];
2752 if (n) {
2753 struct kmem_cache_list *l = &n->list;
2755 spin_lock_irq(&n->list_lock);
2756 claim_remote_free_list(s, l);
2757 flush_free_list(s, l);
2758 spin_unlock_irq(&n->list_lock);
2760 #endif
2762 local_irq_disable();
2763 kmem_cache_trim_percpu(s);
2764 local_irq_enable();
2767 up_read(&slqb_lock);
2768 out:
2769 schedule_delayed_work(work, round_jiffies_relative(3*HZ));
2772 static DEFINE_PER_CPU(struct delayed_work, slqb_cache_trim_work);
2774 static void __cpuinit start_cpu_timer(int cpu)
2776 struct delayed_work *cache_trim_work = &per_cpu(slqb_cache_trim_work,
2777 cpu);
2780 * When this gets called from do_initcalls via cpucache_init(),
2781  * init_workqueues() has already run, so keventd will already
2782  * be set up by then.
2784 if (keventd_up() && cache_trim_work->work.func == NULL) {
2785 INIT_DELAYED_WORK(cache_trim_work, cache_trim_worker);
2786 schedule_delayed_work_on(cpu, cache_trim_work,
2787 __round_jiffies_relative(HZ, cpu));
2791 static int __init cpucache_init(void)
2793 int cpu;
2795 for_each_online_cpu(cpu)
2796 start_cpu_timer(cpu);
2798 return 0;
2800 device_initcall(cpucache_init);
2802 #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
2803 static void slab_mem_going_offline_callback(void *arg)
2805 kmem_cache_reap();
2808 static void slab_mem_offline_callback(void *arg)
2810 /* XXX: should release structures, see CPU offline comment */
2813 static int slab_mem_going_online_callback(void *arg)
2815 struct kmem_cache *s;
2816 struct kmem_cache_node *n;
2817 struct memory_notify *marg = arg;
2818 int nid = marg->status_change_nid;
2819 int ret = 0;
2822 * If the node's memory is already available, then kmem_cache_node is
2823 * already created. Nothing to do.
2825 if (nid < 0)
2826 return 0;
2829  * We are bringing a node online. No memory is available yet. We must
2830 * allocate a kmem_cache_node structure in order to bring the node
2831 * online.
2833 down_write(&slqb_lock);
2834 list_for_each_entry(s, &slab_caches, list) {
2836          * XXX: kmem_cache_alloc_node will fall back to other nodes
2837 * since memory is not yet available from the node that
2838 * is brought up.
2840         if (s->node_slab[nid]) /* could be leftover from last online */
2841 continue;
2842 n = kmem_cache_alloc(&kmem_node_cache, GFP_KERNEL);
2843 if (!n) {
2844 ret = -ENOMEM;
2845 goto out;
2847 init_kmem_cache_node(s, n);
2848 s->node_slab[nid] = n;
2850 out:
2851 up_write(&slqb_lock);
2852 return ret;
2855 static int slab_memory_callback(struct notifier_block *self,
2856 unsigned long action, void *arg)
2858 int ret = 0;
2860 switch (action) {
2861 case MEM_GOING_ONLINE:
2862 ret = slab_mem_going_online_callback(arg);
2863 break;
2864 case MEM_GOING_OFFLINE:
2865 slab_mem_going_offline_callback(arg);
2866 break;
2867 case MEM_OFFLINE:
2868 case MEM_CANCEL_ONLINE:
2869 slab_mem_offline_callback(arg);
2870 break;
2871 case MEM_ONLINE:
2872 case MEM_CANCEL_OFFLINE:
2873 break;
2876 if (ret)
2877 ret = notifier_from_errno(ret);
2878 else
2879 ret = NOTIFY_OK;
2880 return ret;
2883 #endif /* CONFIG_MEMORY_HOTPLUG */
2885 /********************************************************************
2886 * Basic setup of slabs
2887 *******************************************************************/
2889 void __init kmem_cache_init(void)
2891 int i;
2892 unsigned int flags = SLAB_HWCACHE_ALIGN|SLAB_PANIC;
2895 * All the ifdefs are rather ugly here, but it's just the setup code,
2896 * so it doesn't have to be too readable :)
2900 * No need to take slqb_lock here: there should be no concurrency
2901 * anyway, and spin_unlock_irq in rwsem code could enable interrupts
2902 * too early.
2904 kmem_cache_open(&kmem_cache_cache, "kmem_cache",
2905 sizeof(struct kmem_cache), 0, flags, NULL, 0);
2906 #ifdef CONFIG_SMP
2907 kmem_cache_open(&kmem_cpu_cache, "kmem_cache_cpu",
2908 sizeof(struct kmem_cache_cpu), 0, flags, NULL, 0);
2909 #endif
2910 #ifdef CONFIG_NUMA
2911 kmem_cache_open(&kmem_node_cache, "kmem_cache_node",
2912 sizeof(struct kmem_cache_node), 0, flags, NULL, 0);
2913 #endif
2915 #ifdef CONFIG_SMP
2916 for_each_possible_cpu(i) {
2917 struct kmem_cache_cpu *c;
2919 c = &per_cpu(kmem_cache_cpus, i);
2920 init_kmem_cache_cpu(&kmem_cache_cache, c);
2921 kmem_cache_cache.cpu_slab[i] = c;
2923 c = &per_cpu(kmem_cpu_cpus, i);
2924 init_kmem_cache_cpu(&kmem_cpu_cache, c);
2925 kmem_cpu_cache.cpu_slab[i] = c;
2927 #ifdef CONFIG_NUMA
2928 c = &per_cpu(kmem_node_cpus, i);
2929 init_kmem_cache_cpu(&kmem_node_cache, c);
2930 kmem_node_cache.cpu_slab[i] = c;
2931 #endif
2933 #else
2934 init_kmem_cache_cpu(&kmem_cache_cache, &kmem_cache_cache.cpu_slab);
2935 #endif
2937 #ifdef CONFIG_NUMA
2938 for_each_node_state(i, N_NORMAL_MEMORY) {
2939 struct kmem_cache_node *n;
2941 n = &kmem_cache_nodes[i];
2942 init_kmem_cache_node(&kmem_cache_cache, n);
2943 kmem_cache_cache.node_slab[i] = n;
2944 #ifdef CONFIG_SMP
2945 n = &kmem_cpu_nodes[i];
2946 init_kmem_cache_node(&kmem_cpu_cache, n);
2947 kmem_cpu_cache.node_slab[i] = n;
2948 #endif
2949 n = &kmem_node_nodes[i];
2950 init_kmem_cache_node(&kmem_node_cache, n);
2951 kmem_node_cache.node_slab[i] = n;
2953 #endif
2955     /* Caches that are not power-of-two sized */
2956 if (L1_CACHE_BYTES < 64 && KMALLOC_MIN_SIZE <= 64) {
2957 open_kmalloc_cache(&kmalloc_caches[1],
2958 "kmalloc-96", 96, GFP_KERNEL);
2959 #ifdef CONFIG_ZONE_DMA
2960 open_kmalloc_cache(&kmalloc_caches_dma[1],
2961 "kmalloc_dma-96", 96, GFP_KERNEL|SLQB_DMA);
2962 #endif
2964 if (L1_CACHE_BYTES < 128 && KMALLOC_MIN_SIZE <= 128) {
2965 open_kmalloc_cache(&kmalloc_caches[2],
2966 "kmalloc-192", 192, GFP_KERNEL);
2967 #ifdef CONFIG_ZONE_DMA
2968 open_kmalloc_cache(&kmalloc_caches_dma[2],
2969 "kmalloc_dma-192", 192, GFP_KERNEL|SLQB_DMA);
2970 #endif
2973 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
2974 open_kmalloc_cache(&kmalloc_caches[i],
2975 "kmalloc", 1 << i, GFP_KERNEL);
2976 #ifdef CONFIG_ZONE_DMA
2977 open_kmalloc_cache(&kmalloc_caches_dma[i],
2978 "kmalloc_dma", 1 << i, GFP_KERNEL|SLQB_DMA);
2979 #endif
2983 * Patch up the size_index table if we have strange large alignment
2984  * requirements for the kmalloc array. This seems to be the case
2985  * only for MIPS. The standard arches will not generate any code here.
2987 * Largest permitted alignment is 256 bytes due to the way we
2988 * handle the index determination for the smaller caches.
2990 * Make sure that nothing crazy happens if someone starts tinkering
2991 * around with ARCH_KMALLOC_MINALIGN
2993 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
2994 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
2996 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
2997 size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
2999 /* Provide the correct kmalloc names now that the caches are up */
3000 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_SLQB_HIGH; i++) {
3001 kmalloc_caches[i].name =
3002 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
3003 #ifdef CONFIG_ZONE_DMA
3004 kmalloc_caches_dma[i].name =
3005 kasprintf(GFP_KERNEL, "kmalloc_dma-%d", 1 << i);
3006 #endif
3009 #ifdef CONFIG_SMP
3010 register_cpu_notifier(&slab_notifier);
3011 #endif
3012 #ifdef CONFIG_NUMA
3013 hotplug_memory_notifier(slab_memory_callback, 1);
3014 #endif
3016 * smp_init() has not yet been called, so no worries about memory
3017 * ordering with __slab_is_available.
3019 __slab_is_available = 1;
3022 void __init kmem_cache_init_late(void)
3027 * Some basic slab creation sanity checks
3029 static int kmem_cache_create_ok(const char *name, size_t size,
3030 size_t align, unsigned long flags)
3032 struct kmem_cache *tmp;
3035 * Sanity checks... these are all serious usage bugs.
3037 if (!name || in_interrupt() || (size < sizeof(void *))) {
3038 printk(KERN_ERR "kmem_cache_create(): early error in slab %s\n",
3039 name);
3040 dump_stack();
3042 return 0;
3045 list_for_each_entry(tmp, &slab_caches, list) {
3046 char x;
3047 int res;
3050          * This happens when a module gets unloaded without destroying
3051          * its slab cache, and nobody else has reused the module's vmalloc
3052          * area. Print a warning.
3054 res = probe_kernel_address(tmp->name, x);
3055 if (res) {
3056 printk(KERN_ERR
3057 "SLAB: cache with size %d has lost its name\n",
3058 tmp->size);
3059 continue;
3062 if (!strcmp(tmp->name, name)) {
3063 printk(KERN_ERR
3064 "SLAB: duplicate cache %s\n", name);
3065 dump_stack();
3067 return 0;
3071 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
3072 if (flags & SLAB_DESTROY_BY_RCU)
3073 WARN_ON(flags & SLAB_POISON);
3075 return 1;
3078 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3079 size_t align, unsigned long flags, void (*ctor)(void *))
3081 struct kmem_cache *s;
3083 down_write(&slqb_lock);
3084 if (!kmem_cache_create_ok(name, size, align, flags))
3085 goto err;
3087 s = kmem_cache_alloc(&kmem_cache_cache, GFP_KERNEL);
3088 if (!s)
3089 goto err;
3091 if (kmem_cache_open(s, name, size, align, flags, ctor, 1)) {
3092 up_write(&slqb_lock);
3093 return s;
3096 kmem_cache_free(&kmem_cache_cache, s);
3098 err:
3099 up_write(&slqb_lock);
3100 if (flags & SLAB_PANIC)
3101 panic("%s: failed to create slab `%s'\n", __func__, name);
3103 return NULL;
3105 EXPORT_SYMBOL(kmem_cache_create);
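/*
 * Sketch of a cache with a constructor (names are illustrative). The
 * constructor is run when objects are set up on a freshly allocated slab
 * page, not on every kmem_cache_alloc(), so it should only establish
 * invariants that remain valid across free/alloc cycles (locks, list
 * heads), at least under the usual slab-constructor contract assumed here.
 */
#if 0	/* example only */
struct bar {
	spinlock_t lock;
	struct list_head entries;
};

static void bar_ctor(void *obj)
{
	struct bar *b = obj;

	spin_lock_init(&b->lock);
	INIT_LIST_HEAD(&b->entries);
}

static struct kmem_cache *bar_cache;

static int __init bar_init(void)
{
	bar_cache = kmem_cache_create("bar_cache", sizeof(struct bar), 0,
				      SLAB_HWCACHE_ALIGN, bar_ctor);
	return bar_cache ? 0 : -ENOMEM;
}
#endif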
3107 #ifdef CONFIG_SMP
3109  * Use the cpu notifier to ensure that the cpu slabs are flushed when
3110 * necessary.
3112 static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3113 unsigned long action, void *hcpu)
3115 long cpu = (long)hcpu;
3116 struct kmem_cache *s;
3118 switch (action) {
3119 case CPU_UP_PREPARE:
3120 case CPU_UP_PREPARE_FROZEN:
3121 down_write(&slqb_lock);
3122 list_for_each_entry(s, &slab_caches, list) {
3123             if (s->cpu_slab[cpu]) /* could be leftover from last online */
3124 continue;
3125 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu);
3126 if (!s->cpu_slab[cpu]) {
3127                 up_write(&slqb_lock);
3128 return NOTIFY_BAD;
3131 up_write(&slqb_lock);
3132 break;
3134 case CPU_ONLINE:
3135 case CPU_ONLINE_FROZEN:
3136 case CPU_DOWN_FAILED:
3137 case CPU_DOWN_FAILED_FROZEN:
3138 start_cpu_timer(cpu);
3139 break;
3141 case CPU_DOWN_PREPARE:
3142 case CPU_DOWN_PREPARE_FROZEN:
3143 cancel_rearming_delayed_work(&per_cpu(slqb_cache_trim_work,
3144 cpu));
3145 per_cpu(slqb_cache_trim_work, cpu).work.func = NULL;
3146 break;
3148 case CPU_UP_CANCELED:
3149 case CPU_UP_CANCELED_FROZEN:
3150 case CPU_DEAD:
3151 case CPU_DEAD_FROZEN:
3153 * XXX: Freeing here doesn't work because objects can still be
3154          * on this CPU's list. The periodic timer needs to check if a CPU
3155          * is offline and then try to clean up from there. Same for node
3156 * offline.
3158 default:
3159 break;
3161 return NOTIFY_OK;
3164 static struct notifier_block __cpuinitdata slab_notifier = {
3165 .notifier_call = slab_cpuup_callback
3168 #endif
3170 #ifdef CONFIG_SLQB_DEBUG
3171 void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
3173 struct kmem_cache *s;
3174 int node = -1;
3176 s = get_slab(size, flags);
3177 if (unlikely(ZERO_OR_NULL_PTR(s)))
3178 return s;
3180 #ifdef CONFIG_NUMA
3181 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3182 node = alternate_nid(s, flags, node);
3183 #endif
3184 return slab_alloc(s, flags, node, caller);
3187 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
3188 unsigned long caller)
3190 struct kmem_cache *s;
3192 s = get_slab(size, flags);
3193 if (unlikely(ZERO_OR_NULL_PTR(s)))
3194 return s;
3196 return slab_alloc(s, flags, node, caller);
3198 #endif
3200 #if defined(CONFIG_SLQB_SYSFS) || defined(CONFIG_SLABINFO)
3201 struct stats_gather {
3202 struct kmem_cache *s;
3203 spinlock_t lock;
3204 unsigned long nr_slabs;
3205 unsigned long nr_partial;
3206 unsigned long nr_inuse;
3207 unsigned long nr_objects;
3209 #ifdef CONFIG_SLQB_STATS
3210 unsigned long stats[NR_SLQB_STAT_ITEMS];
3211 #endif
3214 static void __gather_stats(void *arg)
3216 unsigned long nr_slabs;
3217 unsigned long nr_partial;
3218 unsigned long nr_inuse;
3219 struct stats_gather *gather = arg;
3220 int cpu = smp_processor_id();
3221 struct kmem_cache *s = gather->s;
3222 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3223 struct kmem_cache_list *l = &c->list;
3224 struct slqb_page *page;
3225 #ifdef CONFIG_SLQB_STATS
3226 int i;
3227 #endif
3229 spin_lock(&l->page_lock);
3230 nr_slabs = l->nr_slabs;
3231 nr_partial = l->nr_partial;
3232 nr_inuse = (nr_slabs - nr_partial) * s->objects;
3234 list_for_each_entry(page, &l->partial, lru) {
3235 nr_inuse += page->inuse;
3237 spin_unlock(&l->page_lock);
3239 spin_lock(&gather->lock);
3240 gather->nr_slabs += nr_slabs;
3241 gather->nr_partial += nr_partial;
3242 gather->nr_inuse += nr_inuse;
3243 #ifdef CONFIG_SLQB_STATS
3244 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3245 gather->stats[i] += l->stats[i];
3246 #endif
3247 spin_unlock(&gather->lock);
3250 /* must be called with slqb_lock held */
3251 static void gather_stats_locked(struct kmem_cache *s,
3252 struct stats_gather *stats)
3254 #ifdef CONFIG_NUMA
3255 int node;
3256 #endif
3258 memset(stats, 0, sizeof(struct stats_gather));
3259 stats->s = s;
3260 spin_lock_init(&stats->lock);
3262 on_each_cpu(__gather_stats, stats, 1);
3264 #ifdef CONFIG_NUMA
3265 for_each_online_node(node) {
3266 struct kmem_cache_node *n = s->node_slab[node];
3267 struct kmem_cache_list *l = &n->list;
3268 struct slqb_page *page;
3269 unsigned long flags;
3270 #ifdef CONFIG_SLQB_STATS
3271 int i;
3272 #endif
3274 spin_lock_irqsave(&n->list_lock, flags);
3275 #ifdef CONFIG_SLQB_STATS
3276 for (i = 0; i < NR_SLQB_STAT_ITEMS; i++)
3277 stats->stats[i] += l->stats[i];
3278 #endif
3279 stats->nr_slabs += l->nr_slabs;
3280 stats->nr_partial += l->nr_partial;
3281 stats->nr_inuse += (l->nr_slabs - l->nr_partial) * s->objects;
3283 list_for_each_entry(page, &l->partial, lru) {
3284 stats->nr_inuse += page->inuse;
3286 spin_unlock_irqrestore(&n->list_lock, flags);
3288 #endif
3290 stats->nr_objects = stats->nr_slabs * s->objects;
3293 #ifdef CONFIG_SLQB_SYSFS
3294 static void gather_stats(struct kmem_cache *s, struct stats_gather *stats)
3296 down_read(&slqb_lock); /* hold off hotplug */
3297 gather_stats_locked(s, stats);
3298 up_read(&slqb_lock);
3300 #endif
3301 #endif
3304 * The /proc/slabinfo ABI
3306 #ifdef CONFIG_SLABINFO
3307 #include <linux/proc_fs.h>
3308 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3309 size_t count, loff_t *ppos)
3311 return -EINVAL;
3314 static void print_slabinfo_header(struct seq_file *m)
3316 seq_puts(m, "slabinfo - version: 2.1\n");
3317 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3318 "<objperslab> <pagesperslab>");
3319 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3320 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3321 seq_putc(m, '\n');
3324 static void *s_start(struct seq_file *m, loff_t *pos)
3326 loff_t n = *pos;
3328 down_read(&slqb_lock);
3329 if (!n)
3330 print_slabinfo_header(m);
3332 return seq_list_start(&slab_caches, *pos);
3335 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3337 return seq_list_next(p, &slab_caches, pos);
3340 static void s_stop(struct seq_file *m, void *p)
3342 up_read(&slqb_lock);
3345 static int s_show(struct seq_file *m, void *p)
3347 struct stats_gather stats;
3348 struct kmem_cache *s;
3350 s = list_entry(p, struct kmem_cache, list);
3352 gather_stats_locked(s, &stats);
3354 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, stats.nr_inuse,
3355 stats.nr_objects, s->size, s->objects, (1 << s->order));
3356 seq_printf(m, " : tunables %4u %4u %4u", slab_hiwater(s),
3357 slab_freebatch(s), 0);
3358 seq_printf(m, " : slabdata %6lu %6lu %6lu", stats.nr_slabs,
3359 stats.nr_slabs, 0UL);
3360 seq_putc(m, '\n');
3361 return 0;
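/*
 * Sample of the line emitted above (numbers invented but internally
 * consistent for a 128-byte cache with 32 objects per order-0 slab and
 * 16 slabs, some of them partially used):
 *
 *   kmalloc-128          320    512    128   32    1 : tunables 1024  256    0 : slabdata     16     16      0
 */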
3364 static const struct seq_operations slabinfo_op = {
3365 .start = s_start,
3366 .next = s_next,
3367 .stop = s_stop,
3368 .show = s_show,
3371 static int slabinfo_open(struct inode *inode, struct file *file)
3373 return seq_open(file, &slabinfo_op);
3376 static const struct file_operations proc_slabinfo_operations = {
3377 .open = slabinfo_open,
3378 .read = seq_read,
3379 .llseek = seq_lseek,
3380 .release = seq_release,
3383 static int __init slab_proc_init(void)
3385 proc_create("slabinfo", S_IWUSR|S_IRUGO, NULL,
3386 &proc_slabinfo_operations);
3387 return 0;
3389 module_init(slab_proc_init);
3390 #endif /* CONFIG_SLABINFO */
3392 #ifdef CONFIG_SLQB_SYSFS
3394 * sysfs API
3396 #define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3397 #define to_slab(n) container_of(n, struct kmem_cache, kobj)
3399 struct slab_attribute {
3400 struct attribute attr;
3401 ssize_t (*show)(struct kmem_cache *s, char *buf);
3402 ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
3405 #define SLAB_ATTR_RO(_name) \
3406 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
3408 #define SLAB_ATTR(_name) \
3409 static struct slab_attribute _name##_attr = \
3410 __ATTR(_name, 0644, _name##_show, _name##_store)
3412 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
3414 return sprintf(buf, "%d\n", s->size);
3416 SLAB_ATTR_RO(slab_size);
3418 static ssize_t align_show(struct kmem_cache *s, char *buf)
3420 return sprintf(buf, "%d\n", s->align);
3422 SLAB_ATTR_RO(align);
3424 static ssize_t object_size_show(struct kmem_cache *s, char *buf)
3426 return sprintf(buf, "%d\n", s->objsize);
3428 SLAB_ATTR_RO(object_size);
3430 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
3432 return sprintf(buf, "%d\n", s->objects);
3434 SLAB_ATTR_RO(objs_per_slab);
3436 static ssize_t order_show(struct kmem_cache *s, char *buf)
3438 return sprintf(buf, "%d\n", s->order);
3440 SLAB_ATTR_RO(order);
3442 static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3444 if (s->ctor) {
3445 int n = sprint_symbol(buf, (unsigned long)s->ctor);
3447 return n + sprintf(buf + n, "\n");
3449 return 0;
3451 SLAB_ATTR_RO(ctor);
3453 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3455 struct stats_gather stats;
3457 gather_stats(s, &stats);
3459 return sprintf(buf, "%lu\n", stats.nr_slabs);
3461 SLAB_ATTR_RO(slabs);
3463 static ssize_t objects_show(struct kmem_cache *s, char *buf)
3465 struct stats_gather stats;
3467 gather_stats(s, &stats);
3469 return sprintf(buf, "%lu\n", stats.nr_inuse);
3471 SLAB_ATTR_RO(objects);
3473 static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
3475 struct stats_gather stats;
3477 gather_stats(s, &stats);
3479 return sprintf(buf, "%lu\n", stats.nr_objects);
3481 SLAB_ATTR_RO(total_objects);
3483 #ifdef CONFIG_FAILSLAB
3484 static ssize_t failslab_show(struct kmem_cache *s, char *buf)
3486 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
3489 static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
3490 size_t length)
3492 s->flags &= ~SLAB_FAILSLAB;
3493 if (buf[0] == '1')
3494 s->flags |= SLAB_FAILSLAB;
3495 return length;
3497 SLAB_ATTR(failslab);
3498 #endif
3500 static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3502 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3504 SLAB_ATTR_RO(reclaim_account);
3506 static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
3508 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
3510 SLAB_ATTR_RO(hwcache_align);
3512 #ifdef CONFIG_ZONE_DMA
3513 static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
3515 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
3517 SLAB_ATTR_RO(cache_dma);
3518 #endif
3520 static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
3522 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
3524 SLAB_ATTR_RO(destroy_by_rcu);
3526 static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
3528 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
3530 SLAB_ATTR_RO(red_zone);
3532 static ssize_t poison_show(struct kmem_cache *s, char *buf)
3534 return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
3536 SLAB_ATTR_RO(poison);
3538 static ssize_t store_user_show(struct kmem_cache *s, char *buf)
3540 return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
3542 SLAB_ATTR_RO(store_user);
3544 static ssize_t hiwater_store(struct kmem_cache *s,
3545 const char *buf, size_t length)
3547 long hiwater;
3548 int err;
3550 err = strict_strtol(buf, 10, &hiwater);
3551 if (err)
3552 return err;
3554 if (hiwater < 0)
3555 return -EINVAL;
3557 s->hiwater = hiwater;
3559 return length;
3562 static ssize_t hiwater_show(struct kmem_cache *s, char *buf)
3564 return sprintf(buf, "%d\n", slab_hiwater(s));
3566 SLAB_ATTR(hiwater);
3568 static ssize_t freebatch_store(struct kmem_cache *s,
3569 const char *buf, size_t length)
3571 long freebatch;
3572 int err;
3574 err = strict_strtol(buf, 10, &freebatch);
3575 if (err)
3576 return err;
3578 if (freebatch <= 0 || freebatch - 1 > s->hiwater)
3579 return -EINVAL;
3581 s->freebatch = freebatch;
3583 return length;
3586 static ssize_t freebatch_show(struct kmem_cache *s, char *buf)
3588 return sprintf(buf, "%d\n", slab_freebatch(s));
3590 SLAB_ATTR(freebatch);
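/*
 * Tuning sketch (path assumes the "slab" kset registered under kernel_kobj
 * in slab_sysfs_init() below, i.e. /sys/kernel/slab/<cache>/). Raise the
 * watermark first so the freebatch check against hiwater cannot fail:
 *
 *   echo 2048 > /sys/kernel/slab/kmalloc-128/hiwater
 *   echo  512 > /sys/kernel/slab/kmalloc-128/freebatch
 */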
3592 #ifdef CONFIG_SLQB_STATS
3593 static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
3595 struct stats_gather stats;
3596 int len;
3597 #ifdef CONFIG_SMP
3598 int cpu;
3599 #endif
3601 gather_stats(s, &stats);
3603 len = sprintf(buf, "%lu", stats.stats[si]);
3605 #ifdef CONFIG_SMP
3606 for_each_online_cpu(cpu) {
3607 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3608 struct kmem_cache_list *l = &c->list;
3610 if (len < PAGE_SIZE - 20)
3611 len += sprintf(buf+len, " C%d=%lu", cpu, l->stats[si]);
3613 #endif
3614 return len + sprintf(buf + len, "\n");
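/*
 * Example of a resulting stat file (values invented): the cache-wide total
 * comes first, followed by a per-CPU breakdown while it fits in the buffer:
 *
 *   31842 C0=10211 C1=9963 C2=5830 C3=5838
 */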
3617 #define STAT_ATTR(si, text) \
3618 static ssize_t text##_show(struct kmem_cache *s, char *buf) \
3620 return show_stat(s, buf, si); \
3622 SLAB_ATTR_RO(text); \
3624 STAT_ATTR(ALLOC, alloc);
3625 STAT_ATTR(ALLOC_SLAB_FILL, alloc_slab_fill);
3626 STAT_ATTR(ALLOC_SLAB_NEW, alloc_slab_new);
3627 STAT_ATTR(FREE, free);
3628 STAT_ATTR(FREE_REMOTE, free_remote);
3629 STAT_ATTR(FLUSH_FREE_LIST, flush_free_list);
3630 STAT_ATTR(FLUSH_FREE_LIST_OBJECTS, flush_free_list_objects);
3631 STAT_ATTR(FLUSH_FREE_LIST_REMOTE, flush_free_list_remote);
3632 STAT_ATTR(FLUSH_SLAB_PARTIAL, flush_slab_partial);
3633 STAT_ATTR(FLUSH_SLAB_FREE, flush_slab_free);
3634 STAT_ATTR(FLUSH_RFREE_LIST, flush_rfree_list);
3635 STAT_ATTR(FLUSH_RFREE_LIST_OBJECTS, flush_rfree_list_objects);
3636 STAT_ATTR(CLAIM_REMOTE_LIST, claim_remote_list);
3637 STAT_ATTR(CLAIM_REMOTE_LIST_OBJECTS, claim_remote_list_objects);
3638 #endif
3640 static struct attribute *slab_attrs[] = {
3641 &slab_size_attr.attr,
3642 &object_size_attr.attr,
3643 &objs_per_slab_attr.attr,
3644 &order_attr.attr,
3645 &objects_attr.attr,
3646 &total_objects_attr.attr,
3647 &slabs_attr.attr,
3648 &ctor_attr.attr,
3649 &align_attr.attr,
3650 &hwcache_align_attr.attr,
3651 &reclaim_account_attr.attr,
3652 &destroy_by_rcu_attr.attr,
3653 &red_zone_attr.attr,
3654 &poison_attr.attr,
3655 &store_user_attr.attr,
3656 &hiwater_attr.attr,
3657 &freebatch_attr.attr,
3658 #ifdef CONFIG_ZONE_DMA
3659 &cache_dma_attr.attr,
3660 #endif
3661 #ifdef CONFIG_SLQB_STATS
3662 &alloc_attr.attr,
3663 &alloc_slab_fill_attr.attr,
3664 &alloc_slab_new_attr.attr,
3665 &free_attr.attr,
3666 &free_remote_attr.attr,
3667 &flush_free_list_attr.attr,
3668 &flush_free_list_objects_attr.attr,
3669 &flush_free_list_remote_attr.attr,
3670 &flush_slab_partial_attr.attr,
3671 &flush_slab_free_attr.attr,
3672 &flush_rfree_list_attr.attr,
3673 &flush_rfree_list_objects_attr.attr,
3674 &claim_remote_list_attr.attr,
3675 &claim_remote_list_objects_attr.attr,
3676 #endif
3677 #ifdef CONFIG_FAILSLAB
3678 &failslab_attr.attr,
3679 #endif
3681 NULL
3684 static struct attribute_group slab_attr_group = {
3685 .attrs = slab_attrs,
3688 static ssize_t slab_attr_show(struct kobject *kobj,
3689 struct attribute *attr, char *buf)
3691 struct slab_attribute *attribute;
3692 struct kmem_cache *s;
3693 int err;
3695 attribute = to_slab_attr(attr);
3696 s = to_slab(kobj);
3698 if (!attribute->show)
3699 return -EIO;
3701 err = attribute->show(s, buf);
3703 return err;
3706 static ssize_t slab_attr_store(struct kobject *kobj,
3707 struct attribute *attr, const char *buf, size_t len)
3709 struct slab_attribute *attribute;
3710 struct kmem_cache *s;
3711 int err;
3713 attribute = to_slab_attr(attr);
3714 s = to_slab(kobj);
3716 if (!attribute->store)
3717 return -EIO;
3719 err = attribute->store(s, buf, len);
3721 return err;
3724 static void kmem_cache_release(struct kobject *kobj)
3726 struct kmem_cache *s = to_slab(kobj);
3728 kmem_cache_free(&kmem_cache_cache, s);
3731 static struct sysfs_ops slab_sysfs_ops = {
3732 .show = slab_attr_show,
3733 .store = slab_attr_store,
3736 static struct kobj_type slab_ktype = {
3737 .sysfs_ops = &slab_sysfs_ops,
3738 .release = kmem_cache_release
3741 static int uevent_filter(struct kset *kset, struct kobject *kobj)
3743 struct kobj_type *ktype = get_ktype(kobj);
3745 if (ktype == &slab_ktype)
3746 return 1;
3747 return 0;
3750 static struct kset_uevent_ops slab_uevent_ops = {
3751 .filter = uevent_filter,
3754 static struct kset *slab_kset;
3756 static int sysfs_available __read_mostly;
3758 static int sysfs_slab_add(struct kmem_cache *s)
3760 int err;
3762 if (!sysfs_available)
3763 return 0;
3765 s->kobj.kset = slab_kset;
3766 err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, s->name);
3767 if (err) {
3768 kobject_put(&s->kobj);
3769 return err;
3772 err = sysfs_create_group(&s->kobj, &slab_attr_group);
3773 if (err)
3774 return err;
3776 kobject_uevent(&s->kobj, KOBJ_ADD);
3778 return 0;
3781 static void sysfs_slab_remove(struct kmem_cache *s)
3783 kobject_uevent(&s->kobj, KOBJ_REMOVE);
3784 kobject_del(&s->kobj);
3785 kobject_put(&s->kobj);
3788 static int __init slab_sysfs_init(void)
3790 struct kmem_cache *s;
3791 int err;
3793 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
3794 if (!slab_kset) {
3795 printk(KERN_ERR "Cannot register slab subsystem.\n");
3796 return -ENOSYS;
3799 down_write(&slqb_lock);
3801 sysfs_available = 1;
3803 list_for_each_entry(s, &slab_caches, list) {
3804 err = sysfs_slab_add(s);
3805 if (err)
3806 printk(KERN_ERR "SLQB: Unable to add boot slab %s"
3807 " to sysfs\n", s->name);
3810 up_write(&slqb_lock);
3812 return 0;
3814 device_initcall(slab_sysfs_init);
3816 #endif