Linux v2.6.15-rc7
[pohmelfs.git] / mm / mempolicy.c
blobbec88c81244e0d4f9541f4098f5f8c8409ba455b
1 /*
2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
11 * Support four policies per VMA and per process:
13 * The VMA policy has priority over the process policy for a page fault.
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache
58 object
59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires
61 first item above.
62 handle mremap for shared memory (currently ignored for the policy)
63 grows down?
64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that.
66 could replace all the switch()es with a mempolicy_ops structure.
69 #include <linux/mempolicy.h>
70 #include <linux/mm.h>
71 #include <linux/highmem.h>
72 #include <linux/hugetlb.h>
73 #include <linux/kernel.h>
74 #include <linux/sched.h>
75 #include <linux/mm.h>
76 #include <linux/nodemask.h>
77 #include <linux/cpuset.h>
78 #include <linux/gfp.h>
79 #include <linux/slab.h>
80 #include <linux/string.h>
81 #include <linux/module.h>
82 #include <linux/interrupt.h>
83 #include <linux/init.h>
84 #include <linux/compat.h>
85 #include <linux/mempolicy.h>
86 #include <asm/tlbflush.h>
87 #include <asm/uaccess.h>
89 static kmem_cache_t *policy_cache;
90 static kmem_cache_t *sn_cache;
92 #define PDprintk(fmt...)
94 /* Highest zone. An specific allocation for a zone below that is not
95 policied. */
96 static int policy_zone;
98 struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */
100 .policy = MPOL_DEFAULT,
103 /* Do sanity checking on a policy */
104 static int mpol_check_policy(int mode, nodemask_t *nodes)
106 int empty = nodes_empty(*nodes);
108 switch (mode) {
109 case MPOL_DEFAULT:
110 if (!empty)
111 return -EINVAL;
112 break;
113 case MPOL_BIND:
114 case MPOL_INTERLEAVE:
115 /* Preferred will only use the first bit, but allow
116 more for now. */
117 if (empty)
118 return -EINVAL;
119 break;
121 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
123 /* Generate a custom zonelist for the BIND policy. */
124 static struct zonelist *bind_zonelist(nodemask_t *nodes)
126 struct zonelist *zl;
127 int num, max, nd;
129 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
130 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
131 if (!zl)
132 return NULL;
133 num = 0;
134 for_each_node_mask(nd, *nodes) {
135 int k;
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
145 zl->zones[num] = NULL;
146 return zl;
149 /* Create a new policy */
150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
152 struct mempolicy *policy;
154 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
155 if (mode == MPOL_DEFAULT)
156 return NULL;
157 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
158 if (!policy)
159 return ERR_PTR(-ENOMEM);
160 atomic_set(&policy->refcnt, 1);
161 switch (mode) {
162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes;
164 break;
165 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes);
167 if (policy->v.preferred_node >= MAX_NUMNODES)
168 policy->v.preferred_node = -1;
169 break;
170 case MPOL_BIND:
171 policy->v.zonelist = bind_zonelist(nodes);
172 if (policy->v.zonelist == NULL) {
173 kmem_cache_free(policy_cache, policy);
174 return ERR_PTR(-ENOMEM);
176 break;
178 policy->policy = mode;
179 return policy;
182 /* Ensure all existing pages follow the policy. */
183 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes)
186 pte_t *orig_pte;
187 pte_t *pte;
188 spinlock_t *ptl;
190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 do {
192 struct page *page;
193 unsigned int nid;
195 if (!pte_present(*pte))
196 continue;
197 page = vm_normal_page(vma, addr, *pte);
198 if (!page)
199 continue;
200 nid = page_to_nid(page);
201 if (!node_isset(nid, *nodes))
202 break;
203 } while (pte++, addr += PAGE_SIZE, addr != end);
204 pte_unmap_unlock(orig_pte, ptl);
205 return addr != end;
208 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
209 unsigned long addr, unsigned long end, nodemask_t *nodes)
211 pmd_t *pmd;
212 unsigned long next;
214 pmd = pmd_offset(pud, addr);
215 do {
216 next = pmd_addr_end(addr, end);
217 if (pmd_none_or_clear_bad(pmd))
218 continue;
219 if (check_pte_range(vma, pmd, addr, next, nodes))
220 return -EIO;
221 } while (pmd++, addr = next, addr != end);
222 return 0;
225 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
226 unsigned long addr, unsigned long end, nodemask_t *nodes)
228 pud_t *pud;
229 unsigned long next;
231 pud = pud_offset(pgd, addr);
232 do {
233 next = pud_addr_end(addr, end);
234 if (pud_none_or_clear_bad(pud))
235 continue;
236 if (check_pmd_range(vma, pud, addr, next, nodes))
237 return -EIO;
238 } while (pud++, addr = next, addr != end);
239 return 0;
242 static inline int check_pgd_range(struct vm_area_struct *vma,
243 unsigned long addr, unsigned long end, nodemask_t *nodes)
245 pgd_t *pgd;
246 unsigned long next;
248 pgd = pgd_offset(vma->vm_mm, addr);
249 do {
250 next = pgd_addr_end(addr, end);
251 if (pgd_none_or_clear_bad(pgd))
252 continue;
253 if (check_pud_range(vma, pgd, addr, next, nodes))
254 return -EIO;
255 } while (pgd++, addr = next, addr != end);
256 return 0;
259 /* Step 1: check the range */
260 static struct vm_area_struct *
261 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
262 nodemask_t *nodes, unsigned long flags)
264 int err;
265 struct vm_area_struct *first, *vma, *prev;
267 first = find_vma(mm, start);
268 if (!first)
269 return ERR_PTR(-EFAULT);
270 prev = NULL;
271 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
272 if (!vma->vm_next && vma->vm_end < end)
273 return ERR_PTR(-EFAULT);
274 if (prev && prev->vm_end < vma->vm_start)
275 return ERR_PTR(-EFAULT);
276 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
277 unsigned long endvma = vma->vm_end;
278 if (endvma > end)
279 endvma = end;
280 if (vma->vm_start > start)
281 start = vma->vm_start;
282 err = check_pgd_range(vma, start, endvma, nodes);
283 if (err) {
284 first = ERR_PTR(err);
285 break;
288 prev = vma;
290 return first;
293 /* Apply policy to a single VMA */
294 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
296 int err = 0;
297 struct mempolicy *old = vma->vm_policy;
299 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
300 vma->vm_start, vma->vm_end, vma->vm_pgoff,
301 vma->vm_ops, vma->vm_file,
302 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
304 if (vma->vm_ops && vma->vm_ops->set_policy)
305 err = vma->vm_ops->set_policy(vma, new);
306 if (!err) {
307 mpol_get(new);
308 vma->vm_policy = new;
309 mpol_free(old);
311 return err;
314 /* Step 2: apply policy to a range and do splits. */
315 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
316 unsigned long end, struct mempolicy *new)
318 struct vm_area_struct *next;
319 int err;
321 err = 0;
322 for (; vma && vma->vm_start < end; vma = next) {
323 next = vma->vm_next;
324 if (vma->vm_start < start)
325 err = split_vma(vma->vm_mm, vma, start, 1);
326 if (!err && vma->vm_end > end)
327 err = split_vma(vma->vm_mm, vma, end, 0);
328 if (!err)
329 err = policy_vma(vma, new);
330 if (err)
331 break;
333 return err;
336 static int contextualize_policy(int mode, nodemask_t *nodes)
338 if (!nodes)
339 return 0;
341 /* Update current mems_allowed */
342 cpuset_update_current_mems_allowed();
343 /* Ignore nodes not set in current->mems_allowed */
344 cpuset_restrict_to_mems_allowed(nodes->bits);
345 return mpol_check_policy(mode, nodes);
348 long do_mbind(unsigned long start, unsigned long len,
349 unsigned long mode, nodemask_t *nmask, unsigned long flags)
351 struct vm_area_struct *vma;
352 struct mm_struct *mm = current->mm;
353 struct mempolicy *new;
354 unsigned long end;
355 int err;
357 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
358 return -EINVAL;
359 if (start & ~PAGE_MASK)
360 return -EINVAL;
361 if (mode == MPOL_DEFAULT)
362 flags &= ~MPOL_MF_STRICT;
363 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
364 end = start + len;
365 if (end < start)
366 return -EINVAL;
367 if (end == start)
368 return 0;
369 if (mpol_check_policy(mode, nmask))
370 return -EINVAL;
371 new = mpol_new(mode, nmask);
372 if (IS_ERR(new))
373 return PTR_ERR(new);
375 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
376 mode,nodes_addr(nodes)[0]);
378 down_write(&mm->mmap_sem);
379 vma = check_range(mm, start, end, nmask, flags);
380 err = PTR_ERR(vma);
381 if (!IS_ERR(vma))
382 err = mbind_range(vma, start, end, new);
383 up_write(&mm->mmap_sem);
384 mpol_free(new);
385 return err;
388 /* Set the process memory policy */
389 long do_set_mempolicy(int mode, nodemask_t *nodes)
391 struct mempolicy *new;
393 if (contextualize_policy(mode, nodes))
394 return -EINVAL;
395 new = mpol_new(mode, nodes);
396 if (IS_ERR(new))
397 return PTR_ERR(new);
398 mpol_free(current->mempolicy);
399 current->mempolicy = new;
400 if (new && new->policy == MPOL_INTERLEAVE)
401 current->il_next = first_node(new->v.nodes);
402 return 0;
405 /* Fill a zone bitmap for a policy */
406 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
408 int i;
410 nodes_clear(*nodes);
411 switch (p->policy) {
412 case MPOL_BIND:
413 for (i = 0; p->v.zonelist->zones[i]; i++)
414 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
415 *nodes);
416 break;
417 case MPOL_DEFAULT:
418 break;
419 case MPOL_INTERLEAVE:
420 *nodes = p->v.nodes;
421 break;
422 case MPOL_PREFERRED:
423 /* or use current node instead of online map? */
424 if (p->v.preferred_node < 0)
425 *nodes = node_online_map;
426 else
427 node_set(p->v.preferred_node, *nodes);
428 break;
429 default:
430 BUG();
434 static int lookup_node(struct mm_struct *mm, unsigned long addr)
436 struct page *p;
437 int err;
439 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
440 if (err >= 0) {
441 err = page_to_nid(p);
442 put_page(p);
444 return err;
447 /* Retrieve NUMA policy */
448 long do_get_mempolicy(int *policy, nodemask_t *nmask,
449 unsigned long addr, unsigned long flags)
451 int err;
452 struct mm_struct *mm = current->mm;
453 struct vm_area_struct *vma = NULL;
454 struct mempolicy *pol = current->mempolicy;
456 cpuset_update_current_mems_allowed();
457 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
458 return -EINVAL;
459 if (flags & MPOL_F_ADDR) {
460 down_read(&mm->mmap_sem);
461 vma = find_vma_intersection(mm, addr, addr+1);
462 if (!vma) {
463 up_read(&mm->mmap_sem);
464 return -EFAULT;
466 if (vma->vm_ops && vma->vm_ops->get_policy)
467 pol = vma->vm_ops->get_policy(vma, addr);
468 else
469 pol = vma->vm_policy;
470 } else if (addr)
471 return -EINVAL;
473 if (!pol)
474 pol = &default_policy;
476 if (flags & MPOL_F_NODE) {
477 if (flags & MPOL_F_ADDR) {
478 err = lookup_node(mm, addr);
479 if (err < 0)
480 goto out;
481 *policy = err;
482 } else if (pol == current->mempolicy &&
483 pol->policy == MPOL_INTERLEAVE) {
484 *policy = current->il_next;
485 } else {
486 err = -EINVAL;
487 goto out;
489 } else
490 *policy = pol->policy;
492 if (vma) {
493 up_read(&current->mm->mmap_sem);
494 vma = NULL;
497 err = 0;
498 if (nmask)
499 get_zonemask(pol, nmask);
501 out:
502 if (vma)
503 up_read(&current->mm->mmap_sem);
504 return err;
508 * User space interface with variable sized bitmaps for nodelists.
511 /* Copy a node mask from user space. */
512 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
513 unsigned long maxnode)
515 unsigned long k;
516 unsigned long nlongs;
517 unsigned long endmask;
519 --maxnode;
520 nodes_clear(*nodes);
521 if (maxnode == 0 || !nmask)
522 return 0;
524 nlongs = BITS_TO_LONGS(maxnode);
525 if ((maxnode % BITS_PER_LONG) == 0)
526 endmask = ~0UL;
527 else
528 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
530 /* When the user specified more nodes than supported just check
531 if the non supported part is all zero. */
532 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
533 if (nlongs > PAGE_SIZE/sizeof(long))
534 return -EINVAL;
535 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
536 unsigned long t;
537 if (get_user(t, nmask + k))
538 return -EFAULT;
539 if (k == nlongs - 1) {
540 if (t & endmask)
541 return -EINVAL;
542 } else if (t)
543 return -EINVAL;
545 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
546 endmask = ~0UL;
549 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
550 return -EFAULT;
551 nodes_addr(*nodes)[nlongs-1] &= endmask;
552 return 0;
555 /* Copy a kernel node mask to user space */
556 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
557 nodemask_t *nodes)
559 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
560 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
562 if (copy > nbytes) {
563 if (copy > PAGE_SIZE)
564 return -EINVAL;
565 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
566 return -EFAULT;
567 copy = nbytes;
569 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
572 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
573 unsigned long mode,
574 unsigned long __user *nmask, unsigned long maxnode,
575 unsigned flags)
577 nodemask_t nodes;
578 int err;
580 err = get_nodes(&nodes, nmask, maxnode);
581 if (err)
582 return err;
583 return do_mbind(start, len, mode, &nodes, flags);
586 /* Set the process memory policy */
587 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
588 unsigned long maxnode)
590 int err;
591 nodemask_t nodes;
593 if (mode < 0 || mode > MPOL_MAX)
594 return -EINVAL;
595 err = get_nodes(&nodes, nmask, maxnode);
596 if (err)
597 return err;
598 return do_set_mempolicy(mode, &nodes);
601 /* Retrieve NUMA policy */
602 asmlinkage long sys_get_mempolicy(int __user *policy,
603 unsigned long __user *nmask,
604 unsigned long maxnode,
605 unsigned long addr, unsigned long flags)
607 int err, pval;
608 nodemask_t nodes;
610 if (nmask != NULL && maxnode < MAX_NUMNODES)
611 return -EINVAL;
613 err = do_get_mempolicy(&pval, &nodes, addr, flags);
615 if (err)
616 return err;
618 if (policy && put_user(pval, policy))
619 return -EFAULT;
621 if (nmask)
622 err = copy_nodes_to_user(nmask, maxnode, &nodes);
624 return err;
627 #ifdef CONFIG_COMPAT
629 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
630 compat_ulong_t __user *nmask,
631 compat_ulong_t maxnode,
632 compat_ulong_t addr, compat_ulong_t flags)
634 long err;
635 unsigned long __user *nm = NULL;
636 unsigned long nr_bits, alloc_size;
637 DECLARE_BITMAP(bm, MAX_NUMNODES);
639 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
640 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
642 if (nmask)
643 nm = compat_alloc_user_space(alloc_size);
645 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
647 if (!err && nmask) {
648 err = copy_from_user(bm, nm, alloc_size);
649 /* ensure entire bitmap is zeroed */
650 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
651 err |= compat_put_bitmap(nmask, bm, nr_bits);
654 return err;
657 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
658 compat_ulong_t maxnode)
660 long err = 0;
661 unsigned long __user *nm = NULL;
662 unsigned long nr_bits, alloc_size;
663 DECLARE_BITMAP(bm, MAX_NUMNODES);
665 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
666 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
668 if (nmask) {
669 err = compat_get_bitmap(bm, nmask, nr_bits);
670 nm = compat_alloc_user_space(alloc_size);
671 err |= copy_to_user(nm, bm, alloc_size);
674 if (err)
675 return -EFAULT;
677 return sys_set_mempolicy(mode, nm, nr_bits+1);
680 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
681 compat_ulong_t mode, compat_ulong_t __user *nmask,
682 compat_ulong_t maxnode, compat_ulong_t flags)
684 long err = 0;
685 unsigned long __user *nm = NULL;
686 unsigned long nr_bits, alloc_size;
687 nodemask_t bm;
689 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
690 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
692 if (nmask) {
693 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
694 nm = compat_alloc_user_space(alloc_size);
695 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
698 if (err)
699 return -EFAULT;
701 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
704 #endif
706 /* Return effective policy for a VMA */
707 struct mempolicy *
708 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
710 struct mempolicy *pol = task->mempolicy;
712 if (vma) {
713 if (vma->vm_ops && vma->vm_ops->get_policy)
714 pol = vma->vm_ops->get_policy(vma, addr);
715 else if (vma->vm_policy &&
716 vma->vm_policy->policy != MPOL_DEFAULT)
717 pol = vma->vm_policy;
719 if (!pol)
720 pol = &default_policy;
721 return pol;
724 /* Return a zonelist representing a mempolicy */
725 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
727 int nd;
729 switch (policy->policy) {
730 case MPOL_PREFERRED:
731 nd = policy->v.preferred_node;
732 if (nd < 0)
733 nd = numa_node_id();
734 break;
735 case MPOL_BIND:
736 /* Lower zones don't get a policy applied */
737 /* Careful: current->mems_allowed might have moved */
738 if (gfp_zone(gfp) >= policy_zone)
739 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
740 return policy->v.zonelist;
741 /*FALL THROUGH*/
742 case MPOL_INTERLEAVE: /* should not happen */
743 case MPOL_DEFAULT:
744 nd = numa_node_id();
745 break;
746 default:
747 nd = 0;
748 BUG();
750 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
753 /* Do dynamic interleaving for a process */
754 static unsigned interleave_nodes(struct mempolicy *policy)
756 unsigned nid, next;
757 struct task_struct *me = current;
759 nid = me->il_next;
760 next = next_node(nid, policy->v.nodes);
761 if (next >= MAX_NUMNODES)
762 next = first_node(policy->v.nodes);
763 me->il_next = next;
764 return nid;
767 /* Do static interleaving for a VMA with known offset. */
768 static unsigned offset_il_node(struct mempolicy *pol,
769 struct vm_area_struct *vma, unsigned long off)
771 unsigned nnodes = nodes_weight(pol->v.nodes);
772 unsigned target = (unsigned)off % nnodes;
773 int c;
774 int nid = -1;
776 c = 0;
777 do {
778 nid = next_node(nid, pol->v.nodes);
779 c++;
780 } while (c <= target);
781 return nid;
784 /* Allocate a page in interleaved policy.
785 Own path because it needs to do special accounting. */
786 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
787 unsigned nid)
789 struct zonelist *zl;
790 struct page *page;
792 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
793 page = __alloc_pages(gfp, order, zl);
794 if (page && page_zone(page) == zl->zones[0]) {
795 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
796 put_cpu();
798 return page;
802 * alloc_page_vma - Allocate a page for a VMA.
804 * @gfp:
805 * %GFP_USER user allocation.
806 * %GFP_KERNEL kernel allocations,
807 * %GFP_HIGHMEM highmem/user allocations,
808 * %GFP_FS allocation should not call back into a file system.
809 * %GFP_ATOMIC don't sleep.
811 * @vma: Pointer to VMA or NULL if not available.
812 * @addr: Virtual Address of the allocation. Must be inside the VMA.
814 * This function allocates a page from the kernel page pool and applies
815 * a NUMA policy associated with the VMA or the current process.
816 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
817 * mm_struct of the VMA to prevent it from going away. Should be used for
818 * all allocations for pages that will be mapped into
819 * user space. Returns NULL when no page can be allocated.
821 * Should be called with the mm_sem of the vma hold.
823 struct page *
824 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
826 struct mempolicy *pol = get_vma_policy(current, vma, addr);
828 cpuset_update_current_mems_allowed();
830 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
831 unsigned nid;
832 if (vma) {
833 unsigned long off;
834 off = vma->vm_pgoff;
835 off += (addr - vma->vm_start) >> PAGE_SHIFT;
836 nid = offset_il_node(pol, vma, off);
837 } else {
838 /* fall back to process interleaving */
839 nid = interleave_nodes(pol);
841 return alloc_page_interleave(gfp, 0, nid);
843 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
847 * alloc_pages_current - Allocate pages.
849 * @gfp:
850 * %GFP_USER user allocation,
851 * %GFP_KERNEL kernel allocation,
852 * %GFP_HIGHMEM highmem allocation,
853 * %GFP_FS don't call back into a file system.
854 * %GFP_ATOMIC don't sleep.
855 * @order: Power of two of allocation size in pages. 0 is a single page.
857 * Allocate a page from the kernel page pool. When not in
858 * interrupt context and apply the current process NUMA policy.
859 * Returns NULL when no page can be allocated.
861 * Don't call cpuset_update_current_mems_allowed() unless
862 * 1) it's ok to take cpuset_sem (can WAIT), and
863 * 2) allocating for current task (not interrupt).
865 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
867 struct mempolicy *pol = current->mempolicy;
869 if ((gfp & __GFP_WAIT) && !in_interrupt())
870 cpuset_update_current_mems_allowed();
871 if (!pol || in_interrupt())
872 pol = &default_policy;
873 if (pol->policy == MPOL_INTERLEAVE)
874 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
875 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
877 EXPORT_SYMBOL(alloc_pages_current);
879 /* Slow path of a mempolicy copy */
880 struct mempolicy *__mpol_copy(struct mempolicy *old)
882 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
884 if (!new)
885 return ERR_PTR(-ENOMEM);
886 *new = *old;
887 atomic_set(&new->refcnt, 1);
888 if (new->policy == MPOL_BIND) {
889 int sz = ksize(old->v.zonelist);
890 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
891 if (!new->v.zonelist) {
892 kmem_cache_free(policy_cache, new);
893 return ERR_PTR(-ENOMEM);
895 memcpy(new->v.zonelist, old->v.zonelist, sz);
897 return new;
900 /* Slow path of a mempolicy comparison */
901 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
903 if (!a || !b)
904 return 0;
905 if (a->policy != b->policy)
906 return 0;
907 switch (a->policy) {
908 case MPOL_DEFAULT:
909 return 1;
910 case MPOL_INTERLEAVE:
911 return nodes_equal(a->v.nodes, b->v.nodes);
912 case MPOL_PREFERRED:
913 return a->v.preferred_node == b->v.preferred_node;
914 case MPOL_BIND: {
915 int i;
916 for (i = 0; a->v.zonelist->zones[i]; i++)
917 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
918 return 0;
919 return b->v.zonelist->zones[i] == NULL;
921 default:
922 BUG();
923 return 0;
927 /* Slow path of a mpol destructor. */
928 void __mpol_free(struct mempolicy *p)
930 if (!atomic_dec_and_test(&p->refcnt))
931 return;
932 if (p->policy == MPOL_BIND)
933 kfree(p->v.zonelist);
934 p->policy = MPOL_DEFAULT;
935 kmem_cache_free(policy_cache, p);
939 * Hugetlb policy. Same as above, just works with node numbers instead of
940 * zonelists.
943 /* Find first node suitable for an allocation */
944 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
946 struct mempolicy *pol = get_vma_policy(current, vma, addr);
948 switch (pol->policy) {
949 case MPOL_DEFAULT:
950 return numa_node_id();
951 case MPOL_BIND:
952 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
953 case MPOL_INTERLEAVE:
954 return interleave_nodes(pol);
955 case MPOL_PREFERRED:
956 return pol->v.preferred_node >= 0 ?
957 pol->v.preferred_node : numa_node_id();
959 BUG();
960 return 0;
963 /* Find secondary valid nodes for an allocation */
964 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
966 struct mempolicy *pol = get_vma_policy(current, vma, addr);
968 switch (pol->policy) {
969 case MPOL_PREFERRED:
970 case MPOL_DEFAULT:
971 case MPOL_INTERLEAVE:
972 return 1;
973 case MPOL_BIND: {
974 struct zone **z;
975 for (z = pol->v.zonelist->zones; *z; z++)
976 if ((*z)->zone_pgdat->node_id == nid)
977 return 1;
978 return 0;
980 default:
981 BUG();
982 return 0;
987 * Shared memory backing store policy support.
989 * Remember policies even when nobody has shared memory mapped.
990 * The policies are kept in Red-Black tree linked from the inode.
991 * They are protected by the sp->lock spinlock, which should be held
992 * for any accesses to the tree.
995 /* lookup first element intersecting start-end */
996 /* Caller holds sp->lock */
997 static struct sp_node *
998 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1000 struct rb_node *n = sp->root.rb_node;
1002 while (n) {
1003 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1005 if (start >= p->end)
1006 n = n->rb_right;
1007 else if (end <= p->start)
1008 n = n->rb_left;
1009 else
1010 break;
1012 if (!n)
1013 return NULL;
1014 for (;;) {
1015 struct sp_node *w = NULL;
1016 struct rb_node *prev = rb_prev(n);
1017 if (!prev)
1018 break;
1019 w = rb_entry(prev, struct sp_node, nd);
1020 if (w->end <= start)
1021 break;
1022 n = prev;
1024 return rb_entry(n, struct sp_node, nd);
1027 /* Insert a new shared policy into the list. */
1028 /* Caller holds sp->lock */
1029 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1031 struct rb_node **p = &sp->root.rb_node;
1032 struct rb_node *parent = NULL;
1033 struct sp_node *nd;
1035 while (*p) {
1036 parent = *p;
1037 nd = rb_entry(parent, struct sp_node, nd);
1038 if (new->start < nd->start)
1039 p = &(*p)->rb_left;
1040 else if (new->end > nd->end)
1041 p = &(*p)->rb_right;
1042 else
1043 BUG();
1045 rb_link_node(&new->nd, parent, p);
1046 rb_insert_color(&new->nd, &sp->root);
1047 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1048 new->policy ? new->policy->policy : 0);
1051 /* Find shared policy intersecting idx */
1052 struct mempolicy *
1053 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1055 struct mempolicy *pol = NULL;
1056 struct sp_node *sn;
1058 if (!sp->root.rb_node)
1059 return NULL;
1060 spin_lock(&sp->lock);
1061 sn = sp_lookup(sp, idx, idx+1);
1062 if (sn) {
1063 mpol_get(sn->policy);
1064 pol = sn->policy;
1066 spin_unlock(&sp->lock);
1067 return pol;
1070 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1072 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1073 rb_erase(&n->nd, &sp->root);
1074 mpol_free(n->policy);
1075 kmem_cache_free(sn_cache, n);
1078 struct sp_node *
1079 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1081 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1083 if (!n)
1084 return NULL;
1085 n->start = start;
1086 n->end = end;
1087 mpol_get(pol);
1088 n->policy = pol;
1089 return n;
1092 /* Replace a policy range. */
1093 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1094 unsigned long end, struct sp_node *new)
1096 struct sp_node *n, *new2 = NULL;
1098 restart:
1099 spin_lock(&sp->lock);
1100 n = sp_lookup(sp, start, end);
1101 /* Take care of old policies in the same range. */
1102 while (n && n->start < end) {
1103 struct rb_node *next = rb_next(&n->nd);
1104 if (n->start >= start) {
1105 if (n->end <= end)
1106 sp_delete(sp, n);
1107 else
1108 n->start = end;
1109 } else {
1110 /* Old policy spanning whole new range. */
1111 if (n->end > end) {
1112 if (!new2) {
1113 spin_unlock(&sp->lock);
1114 new2 = sp_alloc(end, n->end, n->policy);
1115 if (!new2)
1116 return -ENOMEM;
1117 goto restart;
1119 n->end = start;
1120 sp_insert(sp, new2);
1121 new2 = NULL;
1122 break;
1123 } else
1124 n->end = start;
1126 if (!next)
1127 break;
1128 n = rb_entry(next, struct sp_node, nd);
1130 if (new)
1131 sp_insert(sp, new);
1132 spin_unlock(&sp->lock);
1133 if (new2) {
1134 mpol_free(new2->policy);
1135 kmem_cache_free(sn_cache, new2);
1137 return 0;
1140 int mpol_set_shared_policy(struct shared_policy *info,
1141 struct vm_area_struct *vma, struct mempolicy *npol)
1143 int err;
1144 struct sp_node *new = NULL;
1145 unsigned long sz = vma_pages(vma);
1147 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1148 vma->vm_pgoff,
1149 sz, npol? npol->policy : -1,
1150 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1152 if (npol) {
1153 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1154 if (!new)
1155 return -ENOMEM;
1157 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1158 if (err && new)
1159 kmem_cache_free(sn_cache, new);
1160 return err;
1163 /* Free a backing policy store on inode delete. */
1164 void mpol_free_shared_policy(struct shared_policy *p)
1166 struct sp_node *n;
1167 struct rb_node *next;
1169 if (!p->root.rb_node)
1170 return;
1171 spin_lock(&p->lock);
1172 next = rb_first(&p->root);
1173 while (next) {
1174 n = rb_entry(next, struct sp_node, nd);
1175 next = rb_next(&n->nd);
1176 rb_erase(&n->nd, &p->root);
1177 mpol_free(n->policy);
1178 kmem_cache_free(sn_cache, n);
1180 spin_unlock(&p->lock);
1183 /* assumes fs == KERNEL_DS */
1184 void __init numa_policy_init(void)
1186 policy_cache = kmem_cache_create("numa_policy",
1187 sizeof(struct mempolicy),
1188 0, SLAB_PANIC, NULL, NULL);
1190 sn_cache = kmem_cache_create("shared_policy_node",
1191 sizeof(struct sp_node),
1192 0, SLAB_PANIC, NULL, NULL);
1194 /* Set interleaving policy for system init. This way not all
1195 the data structures allocated at system boot end up in node zero. */
1197 if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1198 printk("numa_policy_init: interleaving failed\n");
1201 /* Reset policy of current process to default */
1202 void numa_default_policy(void)
1204 do_set_mempolicy(MPOL_DEFAULT, NULL);
1207 /* Migrate a policy to a different set of nodes */
1208 static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1209 const nodemask_t *new)
1211 nodemask_t tmp;
1213 if (!pol)
1214 return;
1216 switch (pol->policy) {
1217 case MPOL_DEFAULT:
1218 break;
1219 case MPOL_INTERLEAVE:
1220 nodes_remap(tmp, pol->v.nodes, *old, *new);
1221 pol->v.nodes = tmp;
1222 current->il_next = node_remap(current->il_next, *old, *new);
1223 break;
1224 case MPOL_PREFERRED:
1225 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1226 *old, *new);
1227 break;
1228 case MPOL_BIND: {
1229 nodemask_t nodes;
1230 struct zone **z;
1231 struct zonelist *zonelist;
1233 nodes_clear(nodes);
1234 for (z = pol->v.zonelist->zones; *z; z++)
1235 node_set((*z)->zone_pgdat->node_id, nodes);
1236 nodes_remap(tmp, nodes, *old, *new);
1237 nodes = tmp;
1239 zonelist = bind_zonelist(&nodes);
1241 /* If no mem, then zonelist is NULL and we keep old zonelist.
1242 * If that old zonelist has no remaining mems_allowed nodes,
1243 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1246 if (zonelist) {
1247 /* Good - got mem - substitute new zonelist */
1248 kfree(pol->v.zonelist);
1249 pol->v.zonelist = zonelist;
1251 break;
1253 default:
1254 BUG();
1255 break;
1260 * Someone moved this task to different nodes. Fixup mempolicies.
1262 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
1263 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1265 void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
1267 rebind_policy(current->mempolicy, old, new);