mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a process counter
   20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/rmap.h>
  89 #include <linux/security.h>
  90 #include <linux/syscalls.h>
  91 #include <linux/ctype.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
/*
 * Internal flags: extra MPOL_MF_* bits derived from MPOL_MF_INTERNAL.
 * In this file they are OR-ed into / tested on the 'flags' argument of
 * check_range() and the page-table walkers it calls.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

/* Slab cache that struct mempolicy objects are allocated from and freed to */
static struct kmem_cache *policy_cache;
/* NOTE(review): no user in the part of the file visible here - confirm what this cache backs */
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_PREFERRED,
        .flags = MPOL_F_LOCAL,
};

/*
 * Per-mode callbacks, indexed by policy mode.  This is only the forward
 * declaration; the table itself is filled in further down in the file.
 *   create: validate 'nodes' and store it in a freshly allocated policy,
 *           returning 0 or a negative errno.
 *   rebind: adjust an existing policy to a new set of nodes.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
 121
 122 /* Check that the nodemask contains at least one populated zone */
 123 static int is_valid_nodemask(const nodemask_t *nodemask)
 124 {
 125         int nd, k;
 126
 127         /* Check that there is something useful in this mask */
 128         k = policy_zone;
 129
 130         for_each_node_mask(nd, *nodemask) {
 131                 struct zone *z;
 132
 133                 for (k = 0; k <= policy_zone; k++) {
 134                         z = &NODE_DATA(nd)->node_zones[k];
 135                         if (z->present_pages > 0)
 136                                 return 1;
 137                 }
 138         }
 139
 140         return 0;
 141 }
 142
 143 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 144 {
 145         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 146 }
 147
 148 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 149                                    const nodemask_t *rel)
 150 {
 151         nodemask_t tmp;
 152         nodes_fold(tmp, *orig, nodes_weight(*rel));
 153         nodes_onto(*ret, tmp, *rel);
 154 }
 155
 156 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 157 {
 158         if (nodes_empty(*nodes))
 159                 return -EINVAL;
 160         pol->v.nodes = *nodes;
 161         return 0;
 162 }
 163
 164 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 165 {
 166         if (!nodes)
 167                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 168         else if (nodes_empty(*nodes))
 169                 return -EINVAL;                 /*  no allowed nodes */
 170         else
 171                 pol->v.preferred_node = first_node(*nodes);
 172         return 0;
 173 }
 174
 175 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 176 {
 177         if (!is_valid_nodemask(nodes))
 178                 return -EINVAL;
 179         pol->v.nodes = *nodes;
 180         return 0;
 181 }
 182
 183 /* Create a new policy */
 184 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 185                                   nodemask_t *nodes)
 186 {
 187         struct mempolicy *policy;
 188         nodemask_t cpuset_context_nmask;
 189         int ret;
 190
 191         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 192                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 193
 194         if (mode == MPOL_DEFAULT) {
 195                 if (nodes && !nodes_empty(*nodes))
 196                         return ERR_PTR(-EINVAL);
 197                 return NULL;    /* simply delete any existing policy */
 198         }
 199         VM_BUG_ON(!nodes);
 200
 201         /*
 202          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 203          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 204          * All other modes require a valid pointer to a non-empty nodemask.
 205          */
 206         if (mode == MPOL_PREFERRED) {
 207                 if (nodes_empty(*nodes)) {
 208                         if (((flags & MPOL_F_STATIC_NODES) ||
 209                              (flags & MPOL_F_RELATIVE_NODES)))
 210                                 return ERR_PTR(-EINVAL);
 211                         nodes = NULL;   /* flag local alloc */
 212                 }
 213         } else if (nodes_empty(*nodes))
 214                 return ERR_PTR(-EINVAL);
 215         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 216         if (!policy)
 217                 return ERR_PTR(-ENOMEM);
 218         atomic_set(&policy->refcnt, 1);
 219         policy->mode = mode;
 220         policy->flags = flags;
 221
 222         if (nodes) {
 223                 /*
 224                  * cpuset related setup doesn't apply to local allocation
 225                  */
 226                 cpuset_update_task_memory_state();
 227                 if (flags & MPOL_F_RELATIVE_NODES)
 228                         mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 229                                                &cpuset_current_mems_allowed);
 230                 else
 231                         nodes_and(cpuset_context_nmask, *nodes,
 232                                   cpuset_current_mems_allowed);
 233                 if (mpol_store_user_nodemask(policy))
 234                         policy->w.user_nodemask = *nodes;
 235                 else
 236                         policy->w.cpuset_mems_allowed =
 237                                                 cpuset_mems_allowed(current);
 238         }
 239
 240         ret = mpol_ops[mode].create(policy,
 241                                 nodes ? &cpuset_context_nmask : NULL);
 242         if (ret < 0) {
 243                 kmem_cache_free(policy_cache, policy);
 244                 return ERR_PTR(ret);
 245         }
 246         return policy;
 247 }
 248
 249 /* Slow path of a mpol destructor. */
 250 void __mpol_put(struct mempolicy *p)
 251 {
 252         if (!atomic_dec_and_test(&p->refcnt))
 253                 return;
 254         kmem_cache_free(policy_cache, p);
 255 }
 256
/*
 * 'rebind' hook installed for MPOL_DEFAULT in mpol_ops[]: intentionally
 * empty, both arguments are ignored.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
 260
 261 static void mpol_rebind_nodemask(struct mempolicy *pol,
 262                                  const nodemask_t *nodes)
 263 {
 264         nodemask_t tmp;
 265
 266         if (pol->flags & MPOL_F_STATIC_NODES)
 267                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 268         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 269                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 270         else {
 271                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 272                             *nodes);
 273                 pol->w.cpuset_mems_allowed = *nodes;
 274         }
 275
 276         pol->v.nodes = tmp;
 277         if (!node_isset(current->il_next, tmp)) {
 278                 current->il_next = next_node(current->il_next, tmp);
 279                 if (current->il_next >= MAX_NUMNODES)
 280                         current->il_next = first_node(tmp);
 281                 if (current->il_next >= MAX_NUMNODES)
 282                         current->il_next = numa_node_id();
 283         }
 284 }
 285
 286 static void mpol_rebind_preferred(struct mempolicy *pol,
 287                                   const nodemask_t *nodes)
 288 {
 289         nodemask_t tmp;
 290
 291         if (pol->flags & MPOL_F_STATIC_NODES) {
 292                 int node = first_node(pol->w.user_nodemask);
 293
 294                 if (node_isset(node, *nodes)) {
 295                         pol->v.preferred_node = node;
 296                         pol->flags &= ~MPOL_F_LOCAL;
 297                 } else
 298                         pol->flags |= MPOL_F_LOCAL;
 299         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 300                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 301                 pol->v.preferred_node = first_node(tmp);
 302         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 303                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 304                                                    pol->w.cpuset_mems_allowed,
 305                                                    *nodes);
 306                 pol->w.cpuset_mems_allowed = *nodes;
 307         }
 308 }
 309
 310 /* Migrate a policy to a different set of nodes */
 311 static void mpol_rebind_policy(struct mempolicy *pol,
 312                                const nodemask_t *newmask)
 313 {
 314         if (!pol)
 315                 return;
 316         if (!mpol_store_user_nodemask(pol) &&
 317             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 318                 return;
 319         mpol_ops[pol->mode].rebind(pol, newmask);
 320 }
 321
 322 /*
 323  * Wrapper for mpol_rebind_policy() that just requires task
 324  * pointer, and updates task mempolicy.
 325  */
 326
 327 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 328 {
 329         mpol_rebind_policy(tsk->mempolicy, new);
 330 }
 331
 332 /*
 333  * Rebind each vma in mm to new nodemask.
 334  *
 335  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 336  */
 337
 338 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 339 {
 340         struct vm_area_struct *vma;
 341
 342         down_write(&mm->mmap_sem);
 343         for (vma = mm->mmap; vma; vma = vma->vm_next)
 344                 mpol_rebind_policy(vma->vm_policy, new);
 345         up_write(&mm->mmap_sem);
 346 }
 347
 348 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 349         [MPOL_DEFAULT] = {
 350                 .rebind = mpol_rebind_default,
 351         },
 352         [MPOL_INTERLEAVE] = {
 353                 .create = mpol_new_interleave,
 354                 .rebind = mpol_rebind_nodemask,
 355         },
 356         [MPOL_PREFERRED] = {
 357                 .create = mpol_new_preferred,
 358                 .rebind = mpol_rebind_preferred,
 359         },
 360         [MPOL_BIND] = {
 361                 .create = mpol_new_bind,
 362                 .rebind = mpol_rebind_nodemask,
 363         },
 364 };
 365
 366 static void gather_stats(struct page *, void *, int pte_dirty);
 367 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 368                                 unsigned long flags);
 369
 370 /* Scan through pages checking if pages follow certain conditions. */
 371 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 372                 unsigned long addr, unsigned long end,
 373                 const nodemask_t *nodes, unsigned long flags,
 374                 void *private)
 375 {
 376         pte_t *orig_pte;
 377         pte_t *pte;
 378         spinlock_t *ptl;
 379
 380         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 381         do {
 382                 struct page *page;
 383                 int nid;
 384
 385                 if (!pte_present(*pte))
 386                         continue;
 387                 page = vm_normal_page(vma, addr, *pte);
 388                 if (!page)
 389                         continue;
 390                 /*
 391                  * The check for PageReserved here is important to avoid
 392                  * handling zero pages and other pages that may have been
 393                  * marked special by the system.
 394                  *
 395                  * If the PageReserved would not be checked here then f.e.
 396                  * the location of the zero page could have an influence
 397                  * on MPOL_MF_STRICT, zero pages would be counted for
 398                  * the per node stats, and there would be useless attempts
 399                  * to put zero pages on the migration list.
 400                  */
 401                 if (PageReserved(page))
 402                         continue;
 403                 nid = page_to_nid(page);
 404                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 405                         continue;
 406
 407                 if (flags & MPOL_MF_STATS)
 408                         gather_stats(page, private, pte_dirty(*pte));
 409                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 410                         migrate_page_add(page, private, flags);
 411                 else
 412                         break;
 413         } while (pte++, addr += PAGE_SIZE, addr != end);
 414         pte_unmap_unlock(orig_pte, ptl);
 415         return addr != end;
 416 }
 417
 418 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 419                 unsigned long addr, unsigned long end,
 420                 const nodemask_t *nodes, unsigned long flags,
 421                 void *private)
 422 {
 423         pmd_t *pmd;
 424         unsigned long next;
 425
 426         pmd = pmd_offset(pud, addr);
 427         do {
 428                 next = pmd_addr_end(addr, end);
 429                 if (pmd_none_or_clear_bad(pmd))
 430                         continue;
 431                 if (check_pte_range(vma, pmd, addr, next, nodes,
 432                                     flags, private))
 433                         return -EIO;
 434         } while (pmd++, addr = next, addr != end);
 435         return 0;
 436 }
 437
 438 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 439                 unsigned long addr, unsigned long end,
 440                 const nodemask_t *nodes, unsigned long flags,
 441                 void *private)
 442 {
 443         pud_t *pud;
 444         unsigned long next;
 445
 446         pud = pud_offset(pgd, addr);
 447         do {
 448                 next = pud_addr_end(addr, end);
 449                 if (pud_none_or_clear_bad(pud))
 450                         continue;
 451                 if (check_pmd_range(vma, pud, addr, next, nodes,
 452                                     flags, private))
 453                         return -EIO;
 454         } while (pud++, addr = next, addr != end);
 455         return 0;
 456 }
 457
 458 static inline int check_pgd_range(struct vm_area_struct *vma,
 459                 unsigned long addr, unsigned long end,
 460                 const nodemask_t *nodes, unsigned long flags,
 461                 void *private)
 462 {
 463         pgd_t *pgd;
 464         unsigned long next;
 465
 466         pgd = pgd_offset(vma->vm_mm, addr);
 467         do {
 468                 next = pgd_addr_end(addr, end);
 469                 if (pgd_none_or_clear_bad(pgd))
 470                         continue;
 471                 if (check_pud_range(vma, pgd, addr, next, nodes,
 472                                     flags, private))
 473                         return -EIO;
 474         } while (pgd++, addr = next, addr != end);
 475         return 0;
 476 }
 477
 478 /*
 479  * Check if all pages in a range are on a set of nodes.
 480  * If pagelist != NULL then isolate pages from the LRU and
 481  * put them on the pagelist.
 482  */
 483 static struct vm_area_struct *
 484 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 485                 const nodemask_t *nodes, unsigned long flags, void *private)
 486 {
 487         int err;
 488         struct vm_area_struct *first, *vma, *prev;
 489
 490         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 491
 492                 err = migrate_prep();
 493                 if (err)
 494                         return ERR_PTR(err);
 495         }
 496
 497         first = find_vma(mm, start);
 498         if (!first)
 499                 return ERR_PTR(-EFAULT);
 500         prev = NULL;
 501         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 502                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 503                         if (!vma->vm_next && vma->vm_end < end)
 504                                 return ERR_PTR(-EFAULT);
 505                         if (prev && prev->vm_end < vma->vm_start)
 506                                 return ERR_PTR(-EFAULT);
 507                 }
 508                 if (!is_vm_hugetlb_page(vma) &&
 509                     ((flags & MPOL_MF_STRICT) ||
 510                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 511                                 vma_migratable(vma)))) {
 512                         unsigned long endvma = vma->vm_end;
 513
 514                         if (endvma > end)
 515                                 endvma = end;
 516                         if (vma->vm_start > start)
 517                                 start = vma->vm_start;
 518                         err = check_pgd_range(vma, start, endvma, nodes,
 519                                                 flags, private);
 520                         if (err) {
 521                                 first = ERR_PTR(err);
 522                                 break;
 523                         }
 524                 }
 525                 prev = vma;
 526         }
 527         return first;
 528 }
 529
 530 /* Apply policy to a single VMA */
 531 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 532 {
 533         int err = 0;
 534         struct mempolicy *old = vma->vm_policy;
 535
 536         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 537                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 538                  vma->vm_ops, vma->vm_file,
 539                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 540
 541         if (vma->vm_ops && vma->vm_ops->set_policy)
 542                 err = vma->vm_ops->set_policy(vma, new);
 543         if (!err) {
 544                 mpol_get(new);
 545                 vma->vm_policy = new;
 546                 mpol_put(old);
 547         }
 548         return err;
 549 }
 550
 551 /* Step 2: apply policy to a range and do splits. */
 552 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 553                        unsigned long end, struct mempolicy *new)
 554 {
 555         struct vm_area_struct *next;
 556         int err;
 557
 558         err = 0;
 559         for (; vma && vma->vm_start < end; vma = next) {
 560                 next = vma->vm_next;
 561                 if (vma->vm_start < start)
 562                         err = split_vma(vma->vm_mm, vma, start, 1);
 563                 if (!err && vma->vm_end > end)
 564                         err = split_vma(vma->vm_mm, vma, end, 0);
 565                 if (!err)
 566                         err = policy_vma(vma, new);
 567                 if (err)
 568                         break;
 569         }
 570         return err;
 571 }
 572
 573 /*
 574  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 575  * mempolicy.  Allows more rapid checking of this (combined perhaps
 576  * with other PF_* flag bits) on memory allocation hot code paths.
 577  *
 578  * If called from outside this file, the task 'p' should -only- be
 579  * a newly forked child not yet visible on the task list, because
 580  * manipulating the task flags of a visible task is not safe.
 581  *
 582  * The above limitation is why this routine has the funny name
 583  * mpol_fix_fork_child_flag().
 584  *
 585  * It is also safe to call this with a task pointer of current,
 586  * which the static wrapper mpol_set_task_struct_flag() does,
 587  * for use within this file.
 588  */
 589
 590 void mpol_fix_fork_child_flag(struct task_struct *p)
 591 {
 592         if (p->mempolicy)
 593                 p->flags |= PF_MEMPOLICY;
 594         else
 595                 p->flags &= ~PF_MEMPOLICY;
 596 }
 597
 598 static void mpol_set_task_struct_flag(void)
 599 {
 600         mpol_fix_fork_child_flag(current);
 601 }
 602
 603 /* Set the process memory policy */
 604 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 605                              nodemask_t *nodes)
 606 {
 607         struct mempolicy *new;
 608         struct mm_struct *mm = current->mm;
 609
 610         new = mpol_new(mode, flags, nodes);
 611         if (IS_ERR(new))
 612                 return PTR_ERR(new);
 613
 614         /*
 615          * prevent changing our mempolicy while show_numa_maps()
 616          * is using it.
 617          * Note:  do_set_mempolicy() can be called at init time
 618          * with no 'mm'.
 619          */
 620         if (mm)
 621                 down_write(&mm->mmap_sem);
 622         mpol_put(current->mempolicy);
 623         current->mempolicy = new;
 624         mpol_set_task_struct_flag();
 625         if (new && new->mode == MPOL_INTERLEAVE &&
 626             nodes_weight(new->v.nodes))
 627                 current->il_next = first_node(new->v.nodes);
 628         if (mm)
 629                 up_write(&mm->mmap_sem);
 630
 631         return 0;
 632 }
 633
 634 /*
 635  * Return nodemask for policy for get_mempolicy() query
 636  */
 637 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 638 {
 639         nodes_clear(*nodes);
 640         if (p == &default_policy)
 641                 return;
 642
 643         switch (p->mode) {
 644         case MPOL_BIND:
 645                 /* Fall through */
 646         case MPOL_INTERLEAVE:
 647                 *nodes = p->v.nodes;
 648                 break;
 649         case MPOL_PREFERRED:
 650                 if (!(p->flags & MPOL_F_LOCAL))
 651                         node_set(p->v.preferred_node, *nodes);
 652                 /* else return empty node mask for local allocation */
 653                 break;
 654         default:
 655                 BUG();
 656         }
 657 }
 658
 659 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 660 {
 661         struct page *p;
 662         int err;
 663
 664         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 665         if (err >= 0) {
 666                 err = page_to_nid(p);
 667                 put_page(p);
 668         }
 669         return err;
 670 }
 671
 672 /* Retrieve NUMA policy */
 673 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 674                              unsigned long addr, unsigned long flags)
 675 {
 676         int err;
 677         struct mm_struct *mm = current->mm;
 678         struct vm_area_struct *vma = NULL;
 679         struct mempolicy *pol = current->mempolicy;
 680
 681         cpuset_update_task_memory_state();
 682         if (flags &
 683                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 684                 return -EINVAL;
 685
 686         if (flags & MPOL_F_MEMS_ALLOWED) {
 687                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 688                         return -EINVAL;
 689                 *policy = 0;    /* just so it's initialized */
 690                 *nmask  = cpuset_current_mems_allowed;
 691                 return 0;
 692         }
 693
 694         if (flags & MPOL_F_ADDR) {
 695                 /*
 696                  * Do NOT fall back to task policy if the
 697                  * vma/shared policy at addr is NULL.  We
 698                  * want to return MPOL_DEFAULT in this case.
 699                  */
 700                 down_read(&mm->mmap_sem);
 701                 vma = find_vma_intersection(mm, addr, addr+1);
 702                 if (!vma) {
 703                         up_read(&mm->mmap_sem);
 704                         return -EFAULT;
 705                 }
 706                 if (vma->vm_ops && vma->vm_ops->get_policy)
 707                         pol = vma->vm_ops->get_policy(vma, addr);
 708                 else
 709                         pol = vma->vm_policy;
 710         } else if (addr)
 711                 return -EINVAL;
 712
 713         if (!pol)
 714                 pol = &default_policy;  /* indicates default behavior */
 715
 716         if (flags & MPOL_F_NODE) {
 717                 if (flags & MPOL_F_ADDR) {
 718                         err = lookup_node(mm, addr);
 719                         if (err < 0)
 720                                 goto out;
 721                         *policy = err;
 722                 } else if (pol == current->mempolicy &&
 723                                 pol->mode == MPOL_INTERLEAVE) {
 724                         *policy = current->il_next;
 725                 } else {
 726                         err = -EINVAL;
 727                         goto out;
 728                 }
 729         } else {
 730                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 731                                                 pol->mode;
 732                 /*
 733                  * Internal mempolicy flags must be masked off before exposing
 734                  * the policy to userspace.
 735                  */
 736                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 737         }
 738
 739         if (vma) {
 740                 up_read(&current->mm->mmap_sem);
 741                 vma = NULL;
 742         }
 743
 744         err = 0;
 745         if (nmask)
 746                 get_policy_nodemask(pol, nmask);
 747
 748  out:
 749         mpol_cond_put(pol);
 750         if (vma)
 751                 up_read(&current->mm->mmap_sem);
 752         return err;
 753 }
 754
 755 #ifdef CONFIG_MIGRATION
 756 /*
 757  * page migration
 758  */
 759 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 760                                 unsigned long flags)
 761 {
 762         /*
 763          * Avoid migrating a page that is shared with others.
 764          */
 765         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 766                 isolate_lru_page(page, pagelist);
 767 }
 768
 769 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 770 {
 771         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 772 }
 773
 774 /*
 775  * Migrate pages from one node to a target node.
 776  * Returns error or the number of pages not migrated.
 777  */
 778 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 779                            int flags)
 780 {
 781         nodemask_t nmask;
 782         LIST_HEAD(pagelist);
 783         int err = 0;
 784
 785         nodes_clear(nmask);
 786         node_set(source, nmask);
 787
 788         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 789                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 790
 791         if (!list_empty(&pagelist))
 792                 err = migrate_pages(&pagelist, new_node_page, dest);
 793
 794         return err;
 795 }
 796
 797 /*
 798  * Move pages between the two nodesets so as to preserve the physical
 799  * layout as much as possible.
 800  *
 801  * Returns the number of page that could not be moved.
 802  */
 803 int do_migrate_pages(struct mm_struct *mm,
 804         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 805 {
 806         int busy = 0;
 807         int err = 0;
 808         nodemask_t tmp;
 809
 810         down_read(&mm->mmap_sem);
 811
 812         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 813         if (err)
 814                 goto out;
 815
 816 /*
 817  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 818  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 819  * bit in 'tmp', and return that <source, dest> pair for migration.
 820  * The pair of nodemasks 'to' and 'from' define the map.
 821  *
 822  * If no pair of bits is found that way, fallback to picking some
 823  * pair of 'source' and 'dest' bits that are not the same.  If the
 824  * 'source' and 'dest' bits are the same, this represents a node
 825  * that will be migrating to itself, so no pages need move.
 826  *
 827  * If no bits are left in 'tmp', or if all remaining bits left
 828  * in 'tmp' correspond to the same bit in 'to', return false
 829  * (nothing left to migrate).
 830  *
 831  * This lets us pick a pair of nodes to migrate between, such that
 832  * if possible the dest node is not already occupied by some other
 833  * source node, minimizing the risk of overloading the memory on a
 834  * node that would happen if we migrated incoming memory to a node
 835  * before migrating outgoing memory source that same node.
 836  *
 837  * A single scan of tmp is sufficient.  As we go, we remember the
 838  * most recent <s, d> pair that moved (s != d).  If we find a pair
 839  * that not only moved, but what's better, moved to an empty slot
 840  * (d is not set in tmp), then we break out then, with that pair.
 841  * Otherwise when we finish scannng from_tmp, we at least have the
 842  * most recent <s, d> pair that moved.  If we get all the way through
 843  * the scan of tmp without finding any node that moved, much less
 844  * moved to an empty node, then there is nothing left worth migrating.
 845  */
 846
 847         tmp = *from_nodes;
 848         while (!nodes_empty(tmp)) {
 849                 int s,d;
 850                 int source = -1;
 851                 int dest = 0;
 852
 853                 for_each_node_mask(s, tmp) {
 854                         d = node_remap(s, *from_nodes, *to_nodes);
 855                         if (s == d)
 856                                 continue;
 857
 858                         source = s;     /* Node moved. Memorize */
 859                         dest = d;
 860
 861                         /* dest not in remaining from nodes? */
 862                         if (!node_isset(dest, tmp))
 863                                 break;
 864                 }
 865                 if (source == -1)
 866                         break;
 867
 868                 node_clear(source, tmp);
 869                 err = migrate_to_node(mm, source, dest, flags);
 870                 if (err > 0)
 871                         busy += err;
 872                 if (err < 0)
 873                         break;
 874         }
 875 out:
 876         up_read(&mm->mmap_sem);
 877         if (err < 0)
 878                 return err;
 879         return busy;
 880
 881 }
 882
 883 /*
 884  * Allocate a new page for page migration based on vma policy.
 885  * Start assuming that page is mapped by vma pointed to by @private.
 886  * Search forward from there, if not.  N.B., this assumes that the
 887  * list of pages handed to migrate_pages()--which is how we get here--
 888  * is in virtual address order.
 889  */
 890 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 891 {
 892         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 893         unsigned long uninitialized_var(address);
 894
 895         while (vma) {
 896                 address = page_address_in_vma(page, vma);
 897                 if (address != -EFAULT)
 898                         break;
 899                 vma = vma->vm_next;
 900         }
 901
 902         /*
 903          * if !vma, alloc_page_vma() will use task or system default policy
 904          */
 905         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 906 }
 907 #else
 908
 909 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 910                                 unsigned long flags)
 911 {
 912 }
 913
 914 int do_migrate_pages(struct mm_struct *mm,
 915         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 916 {
 917         return -ENOSYS;
 918 }
 919
 920 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 921 {
 922         return NULL;
 923 }
 924 #endif
 925
 926 static long do_mbind(unsigned long start, unsigned long len,
 927                      unsigned short mode, unsigned short mode_flags,
 928                      nodemask_t *nmask, unsigned long flags)
 929 {
 930         struct vm_area_struct *vma;
 931         struct mm_struct *mm = current->mm;
 932         struct mempolicy *new;
 933         unsigned long end;
 934         int err;
 935         LIST_HEAD(pagelist);
 936
 937         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 938                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 939                 return -EINVAL;
 940         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 941                 return -EPERM;
 942
 943         if (start & ~PAGE_MASK)
 944                 return -EINVAL;
 945
 946         if (mode == MPOL_DEFAULT)
 947                 flags &= ~MPOL_MF_STRICT;
 948
 949         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 950         end = start + len;
 951
 952         if (end < start)
 953                 return -EINVAL;
 954         if (end == start)
 955                 return 0;
 956
 957         new = mpol_new(mode, mode_flags, nmask);
 958         if (IS_ERR(new))
 959                 return PTR_ERR(new);
 960
 961         /*
 962          * If we are using the default policy then operation
 963          * on discontinuous address spaces is okay after all
 964          */
 965         if (!new)
 966                 flags |= MPOL_MF_DISCONTIG_OK;
 967
 968         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 969                  start, start + len, mode, mode_flags,
 970                  nmask ? nodes_addr(*nmask)[0] : -1);
 971
 972         down_write(&mm->mmap_sem);
 973         vma = check_range(mm, start, end, nmask,
 974                           flags | MPOL_MF_INVERT, &pagelist);
 975
 976         err = PTR_ERR(vma);
 977         if (!IS_ERR(vma)) {
 978                 int nr_failed = 0;
 979
 980                 err = mbind_range(vma, start, end, new);
 981
 982                 if (!list_empty(&pagelist))
 983                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 984                                                 (unsigned long)vma);
 985
 986                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 987                         err = -EIO;
 988         } else
 989                 putback_lru_pages(&pagelist);
 990
 991         up_write(&mm->mmap_sem);
 992         mpol_put(new);
 993         return err;
 994 }
 995
 996 /*
 997  * User space interface with variable sized bitmaps for nodelists.
 998  */
 999
1000 /* Copy a node mask from user space. */
1001 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1002                      unsigned long maxnode)
1003 {
1004         unsigned long k;
1005         unsigned long nlongs;
1006         unsigned long endmask;
1007
1008         --maxnode;
1009         nodes_clear(*nodes);
1010         if (maxnode == 0 || !nmask)
1011                 return 0;
1012         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1013                 return -EINVAL;
1014
1015         nlongs = BITS_TO_LONGS(maxnode);
1016         if ((maxnode % BITS_PER_LONG) == 0)
1017                 endmask = ~0UL;
1018         else
1019                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1020
1021         /* When the user specified more nodes than supported just check
1022            if the non supported part is all zero. */
1023         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1024                 if (nlongs > PAGE_SIZE/sizeof(long))
1025                         return -EINVAL;
1026                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1027                         unsigned long t;
1028                         if (get_user(t, nmask + k))
1029                                 return -EFAULT;
1030                         if (k == nlongs - 1) {
1031                                 if (t & endmask)
1032                                         return -EINVAL;
1033                         } else if (t)
1034                                 return -EINVAL;
1035                 }
1036                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1037                 endmask = ~0UL;
1038         }
1039
1040         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1041                 return -EFAULT;
1042         nodes_addr(*nodes)[nlongs-1] &= endmask;
1043         return 0;
1044 }
1045
1046 /* Copy a kernel node mask to user space */
1047 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1048                               nodemask_t *nodes)
1049 {
1050         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1051         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1052
1053         if (copy > nbytes) {
1054                 if (copy > PAGE_SIZE)
1055                         return -EINVAL;
1056                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1057                         return -EFAULT;
1058                 copy = nbytes;
1059         }
1060         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1061 }
1062
1063 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1064                 unsigned long, mode, unsigned long __user *, nmask,
1065                 unsigned long, maxnode, unsigned, flags)
1066 {
1067         nodemask_t nodes;
1068         int err;
1069         unsigned short mode_flags;
1070
1071         mode_flags = mode & MPOL_MODE_FLAGS;
1072         mode &= ~MPOL_MODE_FLAGS;
1073         if (mode >= MPOL_MAX)
1074                 return -EINVAL;
1075         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1076             (mode_flags & MPOL_F_RELATIVE_NODES))
1077                 return -EINVAL;
1078         err = get_nodes(&nodes, nmask, maxnode);
1079         if (err)
1080                 return err;
1081         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1082 }
1083
1084 /* Set the process memory policy */
1085 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1086                 unsigned long, maxnode)
1087 {
1088         int err;
1089         nodemask_t nodes;
1090         unsigned short flags;
1091
1092         flags = mode & MPOL_MODE_FLAGS;
1093         mode &= ~MPOL_MODE_FLAGS;
1094         if ((unsigned int)mode >= MPOL_MAX)
1095                 return -EINVAL;
1096         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1097                 return -EINVAL;
1098         err = get_nodes(&nodes, nmask, maxnode);
1099         if (err)
1100                 return err;
1101         return do_set_mempolicy(mode, flags, &nodes);
1102 }
1103
1104 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1105                 const unsigned long __user *, old_nodes,
1106                 const unsigned long __user *, new_nodes)
1107 {
1108         struct mm_struct *mm;
1109         struct task_struct *task;
1110         nodemask_t old;
1111         nodemask_t new;
1112         nodemask_t task_nodes;
1113         int err;
1114
1115         err = get_nodes(&old, old_nodes, maxnode);
1116         if (err)
1117                 return err;
1118
1119         err = get_nodes(&new, new_nodes, maxnode);
1120         if (err)
1121                 return err;
1122
1123         /* Find the mm_struct */
1124         read_lock(&tasklist_lock);
1125         task = pid ? find_task_by_vpid(pid) : current;
1126         if (!task) {
1127                 read_unlock(&tasklist_lock);
1128                 return -ESRCH;
1129         }
1130         mm = get_task_mm(task);
1131         read_unlock(&tasklist_lock);
1132
1133         if (!mm)
1134                 return -EINVAL;
1135
1136         /*
1137          * Check if this process has the right to modify the specified
1138          * process. The right exists if the process has administrative
1139          * capabilities, superuser privileges or the same
1140          * userid as the target process.
1141          */
1142         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1143             (current->uid != task->suid) && (current->uid != task->uid) &&
1144             !capable(CAP_SYS_NICE)) {
1145                 err = -EPERM;
1146                 goto out;
1147         }
1148
1149         task_nodes = cpuset_mems_allowed(task);
1150         /* Is the user allowed to access the target nodes? */
1151         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1152                 err = -EPERM;
1153                 goto out;
1154         }
1155
1156         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1157                 err = -EINVAL;
1158                 goto out;
1159         }
1160
1161         err = security_task_movememory(task);
1162         if (err)
1163                 goto out;
1164
1165         err = do_migrate_pages(mm, &old, &new,
1166                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1167 out:
1168         mmput(mm);
1169         return err;
1170 }
1171
1172
1173 /* Retrieve NUMA policy */
1174 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1175                 unsigned long __user *, nmask, unsigned long, maxnode,
1176                 unsigned long, addr, unsigned long, flags)
1177 {
1178         int err;
1179         int uninitialized_var(pval);
1180         nodemask_t nodes;
1181
1182         if (nmask != NULL && maxnode < MAX_NUMNODES)
1183                 return -EINVAL;
1184
1185         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1186
1187         if (err)
1188                 return err;
1189
1190         if (policy && put_user(pval, policy))
1191                 return -EFAULT;
1192
1193         if (nmask)
1194                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1195
1196         return err;
1197 }
1198
1199 #ifdef CONFIG_COMPAT
1200
1201 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1202                                      compat_ulong_t __user *nmask,
1203                                      compat_ulong_t maxnode,
1204                                      compat_ulong_t addr, compat_ulong_t flags)
1205 {
1206         long err;
1207         unsigned long __user *nm = NULL;
1208         unsigned long nr_bits, alloc_size;
1209         DECLARE_BITMAP(bm, MAX_NUMNODES);
1210
1211         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1212         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1213
1214         if (nmask)
1215                 nm = compat_alloc_user_space(alloc_size);
1216
1217         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1218
1219         if (!err && nmask) {
1220                 err = copy_from_user(bm, nm, alloc_size);
1221                 /* ensure entire bitmap is zeroed */
1222                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1223                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1224         }
1225
1226         return err;
1227 }
1228
1229 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1230                                      compat_ulong_t maxnode)
1231 {
1232         long err = 0;
1233         unsigned long __user *nm = NULL;
1234         unsigned long nr_bits, alloc_size;
1235         DECLARE_BITMAP(bm, MAX_NUMNODES);
1236
1237         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1238         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1239
1240         if (nmask) {
1241                 err = compat_get_bitmap(bm, nmask, nr_bits);
1242                 nm = compat_alloc_user_space(alloc_size);
1243                 err |= copy_to_user(nm, bm, alloc_size);
1244         }
1245
1246         if (err)
1247                 return -EFAULT;
1248
1249         return sys_set_mempolicy(mode, nm, nr_bits+1);
1250 }
1251
1252 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1253                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1254                              compat_ulong_t maxnode, compat_ulong_t flags)
1255 {
1256         long err = 0;
1257         unsigned long __user *nm = NULL;
1258         unsigned long nr_bits, alloc_size;
1259         nodemask_t bm;
1260
1261         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1262         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1263
1264         if (nmask) {
1265                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1266                 nm = compat_alloc_user_space(alloc_size);
1267                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1268         }
1269
1270         if (err)
1271                 return -EFAULT;
1272
1273         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1274 }
1275
1276 #endif
1277
1278 /*
1279  * get_vma_policy(@task, @vma, @addr)
1280  * @task - task for fallback if vma policy == default
1281  * @vma   - virtual memory area whose policy is sought
1282  * @addr  - address in @vma for shared policy lookup
1283  *
1284  * Returns effective policy for a VMA at specified address.
1285  * Falls back to @task or system default policy, as necessary.
1286  * Current or other task's task mempolicy and non-shared vma policies
1287  * are protected by the task's mmap_sem, which must be held for read by
1288  * the caller.
1289  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1290  * count--added by the get_policy() vm_op, as appropriate--to protect against
1291  * freeing by another task.  It is the caller's responsibility to free the
1292  * extra reference for shared policies.
1293  */
1294 static struct mempolicy *get_vma_policy(struct task_struct *task,
1295                 struct vm_area_struct *vma, unsigned long addr)
1296 {
1297         struct mempolicy *pol = task->mempolicy;
1298
1299         if (vma) {
1300                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1301                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1302                                                                         addr);
1303                         if (vpol)
1304                                 pol = vpol;
1305                 } else if (vma->vm_policy)
1306                         pol = vma->vm_policy;
1307         }
1308         if (!pol)
1309                 pol = &default_policy;
1310         return pol;
1311 }
1312
1313 /*
1314  * Return a nodemask representing a mempolicy for filtering nodes for
1315  * page allocation
1316  */
1317 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1318 {
1319         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1320         if (unlikely(policy->mode == MPOL_BIND) &&
1321                         gfp_zone(gfp) >= policy_zone &&
1322                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1323                 return &policy->v.nodes;
1324
1325         return NULL;
1326 }
1327
1328 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1329 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1330 {
1331         int nd = numa_node_id();
1332
1333         switch (policy->mode) {
1334         case MPOL_PREFERRED:
1335                 if (!(policy->flags & MPOL_F_LOCAL))
1336                         nd = policy->v.preferred_node;
1337                 break;
1338         case MPOL_BIND:
1339                 /*
1340                  * Normally, MPOL_BIND allocations are node-local within the
1341                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1342                  * current node is part of the mask, we use the zonelist for
1343                  * the first node in the mask instead.
1344                  */
1345                 if (unlikely(gfp & __GFP_THISNODE) &&
1346                                 unlikely(!node_isset(nd, policy->v.nodes)))
1347                         nd = first_node(policy->v.nodes);
1348                 break;
1349         case MPOL_INTERLEAVE: /* should not happen */
1350                 break;
1351         default:
1352                 BUG();
1353         }
1354         return node_zonelist(nd, gfp);
1355 }
1356
1357 /* Do dynamic interleaving for a process */
1358 static unsigned interleave_nodes(struct mempolicy *policy)
1359 {
1360         unsigned nid, next;
1361         struct task_struct *me = current;
1362
1363         nid = me->il_next;
1364         next = next_node(nid, policy->v.nodes);
1365         if (next >= MAX_NUMNODES)
1366                 next = first_node(policy->v.nodes);
1367         if (next < MAX_NUMNODES)
1368                 me->il_next = next;
1369         return nid;
1370 }
1371
1372 /*
1373  * Depending on the memory policy provide a node from which to allocate the
1374  * next slab entry.
1375  * @policy must be protected by freeing by the caller.  If @policy is
1376  * the current task's mempolicy, this protection is implicit, as only the
1377  * task can change it's policy.  The system default policy requires no
1378  * such protection.
1379  */
1380 unsigned slab_node(struct mempolicy *policy)
1381 {
1382         if (!policy || policy->flags & MPOL_F_LOCAL)
1383                 return numa_node_id();
1384
1385         switch (policy->mode) {
1386         case MPOL_PREFERRED:
1387                 /*
1388                  * handled MPOL_F_LOCAL above
1389                  */
1390                 return policy->v.preferred_node;
1391
1392         case MPOL_INTERLEAVE:
1393                 return interleave_nodes(policy);
1394
1395         case MPOL_BIND: {
1396                 /*
1397                  * Follow bind policy behavior and start allocation at the
1398                  * first node.
1399                  */
1400                 struct zonelist *zonelist;
1401                 struct zone *zone;
1402                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1403                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1404                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1405                                                         &policy->v.nodes,
1406                                                         &zone);
1407                 return zone ? zone->node : numa_node_id();
1408         }
1409
1410         default:
1411                 BUG();
1412         }
1413 }
1414
1415 /* Do static interleaving for a VMA with known offset. */
1416 static unsigned offset_il_node(struct mempolicy *pol,
1417                 struct vm_area_struct *vma, unsigned long off)
1418 {
1419         unsigned nnodes = nodes_weight(pol->v.nodes);
1420         unsigned target;
1421         int c;
1422         int nid = -1;
1423
1424         if (!nnodes)
1425                 return numa_node_id();
1426         target = (unsigned int)off % nnodes;
1427         c = 0;
1428         do {
1429                 nid = next_node(nid, pol->v.nodes);
1430                 c++;
1431         } while (c <= target);
1432         return nid;
1433 }
1434
1435 /* Determine a node number for interleave */
1436 static inline unsigned interleave_nid(struct mempolicy *pol,
1437                  struct vm_area_struct *vma, unsigned long addr, int shift)
1438 {
1439         if (vma) {
1440                 unsigned long off;
1441
1442                 /*
1443                  * for small pages, there is no difference between
1444                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1445                  * for huge pages, since vm_pgoff is in units of small
1446                  * pages, we need to shift off the always 0 bits to get
1447                  * a useful offset.
1448                  */
1449                 BUG_ON(shift < PAGE_SHIFT);
1450                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1451                 off += (addr - vma->vm_start) >> shift;
1452                 return offset_il_node(pol, vma, off);
1453         } else
1454                 return interleave_nodes(pol);
1455 }
1456
1457 #ifdef CONFIG_HUGETLBFS
1458 /*
1459  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1460  * @vma = virtual memory area whose policy is sought
1461  * @addr = address in @vma for shared policy lookup and interleave policy
1462  * @gfp_flags = for requested zone
1463  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1464  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1465  *
1466  * Returns a zonelist suitable for a huge page allocation and a pointer
1467  * to the struct mempolicy for conditional unref after allocation.
1468  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1469  * @nodemask for filtering the zonelist.
1470  */
1471 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1472                                 gfp_t gfp_flags, struct mempolicy **mpol,
1473                                 nodemask_t **nodemask)
1474 {
1475         struct zonelist *zl;
1476
1477         *mpol = get_vma_policy(current, vma, addr);
1478         *nodemask = NULL;       /* assume !MPOL_BIND */
1479
1480         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1481                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1482                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1483         } else {
1484                 zl = policy_zonelist(gfp_flags, *mpol);
1485                 if ((*mpol)->mode == MPOL_BIND)
1486                         *nodemask = &(*mpol)->v.nodes;
1487         }
1488         return zl;
1489 }
1490 #endif
1491
1492 /* Allocate a page in interleaved policy.
1493    Own path because it needs to do special accounting. */
1494 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1495                                         unsigned nid)
1496 {
1497         struct zonelist *zl;
1498         struct page *page;
1499
1500         zl = node_zonelist(nid, gfp);
1501         page = __alloc_pages(gfp, order, zl);
1502         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1503                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1504         return page;
1505 }
1506
1507 /**
1508  *      alloc_page_vma  - Allocate a page for a VMA.
1509  *
1510  *      @gfp:
1511  *      %GFP_USER    user allocation.
1512  *      %GFP_KERNEL  kernel allocations,
1513  *      %GFP_HIGHMEM highmem/user allocations,
1514  *      %GFP_FS      allocation should not call back into a file system.
1515  *      %GFP_ATOMIC  don't sleep.
1516  *
1517  *      @vma:  Pointer to VMA or NULL if not available.
1518  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1519  *
1520  *      This function allocates a page from the kernel page pool and applies
1521  *      a NUMA policy associated with the VMA or the current process.
1522  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1523  *      mm_struct of the VMA to prevent it from going away. Should be used for
1524  *      all allocations for pages that will be mapped into
1525  *      user space. Returns NULL when no page can be allocated.
1526  *
1527  *      Should be called with the mm_sem of the vma hold.
1528  */
1529 struct page *
1530 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1531 {
1532         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1533         struct zonelist *zl;
1534
1535         cpuset_update_task_memory_state();
1536
1537         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1538                 unsigned nid;
1539
1540                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1541                 mpol_cond_put(pol);
1542                 return alloc_page_interleave(gfp, 0, nid);
1543         }
1544         zl = policy_zonelist(gfp, pol);
1545         if (unlikely(mpol_needs_cond_ref(pol))) {
1546                 /*
1547                  * slow path: ref counted shared policy
1548                  */
1549                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1550                                                 zl, policy_nodemask(gfp, pol));
1551                 __mpol_put(pol);
1552                 return page;
1553         }
1554         /*
1555          * fast path:  default or task policy
1556          */
1557         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1558 }
1559
1560 /**
1561  *      alloc_pages_current - Allocate pages.
1562  *
1563  *      @gfp:
1564  *              %GFP_USER   user allocation,
1565  *              %GFP_KERNEL kernel allocation,
1566  *              %GFP_HIGHMEM highmem allocation,
1567  *              %GFP_FS     don't call back into a file system.
1568  *              %GFP_ATOMIC don't sleep.
1569  *      @order: Power of two of allocation size in pages. 0 is a single page.
1570  *
1571  *      Allocate a page from the kernel page pool.  When not in
1572  *      interrupt context and apply the current process NUMA policy.
1573  *      Returns NULL when no page can be allocated.
1574  *
1575  *      Don't call cpuset_update_task_memory_state() unless
1576  *      1) it's ok to take cpuset_sem (can WAIT), and
1577  *      2) allocating for current task (not interrupt).
1578  */
1579 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1580 {
1581         struct mempolicy *pol = current->mempolicy;
1582
1583         if ((gfp & __GFP_WAIT) && !in_interrupt())
1584                 cpuset_update_task_memory_state();
1585         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1586                 pol = &default_policy;
1587
1588         /*
1589          * No reference counting needed for current->mempolicy
1590          * nor system default_policy
1591          */
1592         if (pol->mode == MPOL_INTERLEAVE)
1593                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1594         return __alloc_pages_nodemask(gfp, order,
1595                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1596 }
1597 EXPORT_SYMBOL(alloc_pages_current);
1598
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1606
1607 /* Slow path of a mempolicy duplicate */
1608 struct mempolicy *__mpol_dup(struct mempolicy *old)
1609 {
1610         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1611
1612         if (!new)
1613                 return ERR_PTR(-ENOMEM);
1614         if (current_cpuset_is_being_rebound()) {
1615                 nodemask_t mems = cpuset_mems_allowed(current);
1616                 mpol_rebind_policy(old, &mems);
1617         }
1618         *new = *old;
1619         atomic_set(&new->refcnt, 1);
1620         return new;
1621 }
1622
1623 /*
1624  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1625  * eliminate the * MPOL_F_* flags that require conditional ref and
1626  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1627  * after return.  Use the returned value.
1628  *
1629  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1630  * policy lookup, even if the policy needs/has extra ref on lookup.
1631  * shmem_readahead needs this.
1632  */
1633 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1634                                                 struct mempolicy *frompol)
1635 {
1636         if (!mpol_needs_cond_ref(frompol))
1637                 return frompol;
1638
1639         *tompol = *frompol;
1640         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1641         __mpol_put(frompol);
1642         return tompol;
1643 }
1644
1645 static int mpol_match_intent(const struct mempolicy *a,
1646                              const struct mempolicy *b)
1647 {
1648         if (a->flags != b->flags)
1649                 return 0;
1650         if (!mpol_store_user_nodemask(a))
1651                 return 1;
1652         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1653 }
1654
1655 /* Slow path of a mempolicy comparison */
1656 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1657 {
1658         if (!a || !b)
1659                 return 0;
1660         if (a->mode != b->mode)
1661                 return 0;
1662         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1663                 return 0;
1664         switch (a->mode) {
1665         case MPOL_BIND:
1666                 /* Fall through */
1667         case MPOL_INTERLEAVE:
1668                 return nodes_equal(a->v.nodes, b->v.nodes);
1669         case MPOL_PREFERRED:
1670                 return a->v.preferred_node == b->v.preferred_node &&
1671                         a->flags == b->flags;
1672         default:
1673                 BUG();
1674                 return 0;
1675         }
1676 }
1677
1678 /*
1679  * Shared memory backing store policy support.
1680  *
1681  * Remember policies even when nobody has shared memory mapped.
1682  * The policies are kept in Red-Black tree linked from the inode.
1683  * They are protected by the sp->lock spinlock, which should be held
1684  * for any accesses to the tree.
1685  */
1686
1687 /* lookup first element intersecting start-end */
1688 /* Caller holds sp->lock */
1689 static struct sp_node *
1690 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1691 {
1692         struct rb_node *n = sp->root.rb_node;
1693
1694         while (n) {
1695                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1696
1697                 if (start >= p->end)
1698                         n = n->rb_right;
1699                 else if (end <= p->start)
1700                         n = n->rb_left;
1701                 else
1702                         break;
1703         }
1704         if (!n)
1705                 return NULL;
1706         for (;;) {
1707                 struct sp_node *w = NULL;
1708                 struct rb_node *prev = rb_prev(n);
1709                 if (!prev)
1710                         break;
1711                 w = rb_entry(prev, struct sp_node, nd);
1712                 if (w->end <= start)
1713                         break;
1714                 n = prev;
1715         }
1716         return rb_entry(n, struct sp_node, nd);
1717 }
1718
1719 /* Insert a new shared policy into the list. */
1720 /* Caller holds sp->lock */
1721 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1722 {
1723         struct rb_node **p = &sp->root.rb_node;
1724         struct rb_node *parent = NULL;
1725         struct sp_node *nd;
1726
1727         while (*p) {
1728                 parent = *p;
1729                 nd = rb_entry(parent, struct sp_node, nd);
1730                 if (new->start < nd->start)
1731                         p = &(*p)->rb_left;
1732                 else if (new->end > nd->end)
1733                         p = &(*p)->rb_right;
1734                 else
1735                         BUG();
1736         }
1737         rb_link_node(&new->nd, parent, p);
1738         rb_insert_color(&new->nd, &sp->root);
1739         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1740                  new->policy ? new->policy->mode : 0);
1741 }
1742
1743 /* Find shared policy intersecting idx */
1744 struct mempolicy *
1745 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1746 {
1747         struct mempolicy *pol = NULL;
1748         struct sp_node *sn;
1749
1750         if (!sp->root.rb_node)
1751                 return NULL;
1752         spin_lock(&sp->lock);
1753         sn = sp_lookup(sp, idx, idx+1);
1754         if (sn) {
1755                 mpol_get(sn->policy);
1756                 pol = sn->policy;
1757         }
1758         spin_unlock(&sp->lock);
1759         return pol;
1760 }
1761
1762 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1763 {
1764         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1765         rb_erase(&n->nd, &sp->root);
1766         mpol_put(n->policy);
1767         kmem_cache_free(sn_cache, n);
1768 }
1769
1770 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1771                                 struct mempolicy *pol)
1772 {
1773         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1774
1775         if (!n)
1776                 return NULL;
1777         n->start = start;
1778         n->end = end;
1779         mpol_get(pol);
1780         pol->flags |= MPOL_F_SHARED;    /* for unref */
1781         n->policy = pol;
1782         return n;
1783 }
1784
1785 /* Replace a policy range. */
1786 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1787                                  unsigned long end, struct sp_node *new)
1788 {
1789         struct sp_node *n, *new2 = NULL;
1790
1791 restart:
1792         spin_lock(&sp->lock);
1793         n = sp_lookup(sp, start, end);
1794         /* Take care of old policies in the same range. */
1795         while (n && n->start < end) {
1796                 struct rb_node *next = rb_next(&n->nd);
1797                 if (n->start >= start) {
1798                         if (n->end <= end)
1799                                 sp_delete(sp, n);
1800                         else
1801                                 n->start = end;
1802                 } else {
1803                         /* Old policy spanning whole new range. */
1804                         if (n->end > end) {
1805                                 if (!new2) {
1806                                         spin_unlock(&sp->lock);
1807                                         new2 = sp_alloc(end, n->end, n->policy);
1808                                         if (!new2)
1809                                                 return -ENOMEM;
1810                                         goto restart;
1811                                 }
1812                                 n->end = start;
1813                                 sp_insert(sp, new2);
1814                                 new2 = NULL;
1815                                 break;
1816                         } else
1817                                 n->end = start;
1818                 }
1819                 if (!next)
1820                         break;
1821                 n = rb_entry(next, struct sp_node, nd);
1822         }
1823         if (new)
1824                 sp_insert(sp, new);
1825         spin_unlock(&sp->lock);
1826         if (new2) {
1827                 mpol_put(new2->policy);
1828                 kmem_cache_free(sn_cache, new2);
1829         }
1830         return 0;
1831 }
1832
1833 /**
1834  * mpol_shared_policy_init - initialize shared policy for inode
1835  * @sp: pointer to inode shared policy
1836  * @mpol:  struct mempolicy to install
1837  *
1838  * Install non-NULL @mpol in inode's shared policy rb-tree.
1839  * On entry, the current task has a reference on a non-NULL @mpol.
1840  * This must be released on exit.
1841  */
1842 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1843 {
1844         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1845         spin_lock_init(&sp->lock);
1846
1847         if (mpol) {
1848                 struct vm_area_struct pvma;
1849                 struct mempolicy *new;
1850
1851                 /* contextualize the tmpfs mount point mempolicy */
1852                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1853                 mpol_put(mpol); /* drop our ref on sb mpol */
1854                 if (IS_ERR(new))
1855                         return;         /* no valid nodemask intersection */
1856
1857                 /* Create pseudo-vma that contains just the policy */
1858                 memset(&pvma, 0, sizeof(struct vm_area_struct));
1859                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1860                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1861                 mpol_put(new);                  /* drop initial ref */
1862         }
1863 }
1864
1865 int mpol_set_shared_policy(struct shared_policy *info,
1866                         struct vm_area_struct *vma, struct mempolicy *npol)
1867 {
1868         int err;
1869         struct sp_node *new = NULL;
1870         unsigned long sz = vma_pages(vma);
1871
1872         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1873                  vma->vm_pgoff,
1874                  sz, npol ? npol->mode : -1,
1875                  npol ? npol->flags : -1,
1876                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1877
1878         if (npol) {
1879                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1880                 if (!new)
1881                         return -ENOMEM;
1882         }
1883         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1884         if (err && new)
1885                 kmem_cache_free(sn_cache, new);
1886         return err;
1887 }
1888
1889 /* Free a backing policy store on inode delete. */
1890 void mpol_free_shared_policy(struct shared_policy *p)
1891 {
1892         struct sp_node *n;
1893         struct rb_node *next;
1894
1895         if (!p->root.rb_node)
1896                 return;
1897         spin_lock(&p->lock);
1898         next = rb_first(&p->root);
1899         while (next) {
1900                 n = rb_entry(next, struct sp_node, nd);
1901                 next = rb_next(&n->nd);
1902                 rb_erase(&n->nd, &p->root);
1903                 mpol_put(n->policy);
1904                 kmem_cache_free(sn_cache, n);
1905         }
1906         spin_unlock(&p->lock);
1907 }
1908
1909 /* assumes fs == KERNEL_DS */
1910 void __init numa_policy_init(void)
1911 {
1912         nodemask_t interleave_nodes;
1913         unsigned long largest = 0;
1914         int nid, prefer = 0;
1915
1916         policy_cache = kmem_cache_create("numa_policy",
1917                                          sizeof(struct mempolicy),
1918                                          0, SLAB_PANIC, NULL);
1919
1920         sn_cache = kmem_cache_create("shared_policy_node",
1921                                      sizeof(struct sp_node),
1922                                      0, SLAB_PANIC, NULL);
1923
1924         /*
1925          * Set interleaving policy for system init. Interleaving is only
1926          * enabled across suitably sized nodes (default is >= 16MB), or
1927          * fall back to the largest node if they're all smaller.
1928          */
1929         nodes_clear(interleave_nodes);
1930         for_each_node_state(nid, N_HIGH_MEMORY) {
1931                 unsigned long total_pages = node_present_pages(nid);
1932
1933                 /* Preserve the largest node */
1934                 if (largest < total_pages) {
1935                         largest = total_pages;
1936                         prefer = nid;
1937                 }
1938
1939                 /* Interleave this node? */
1940                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1941                         node_set(nid, interleave_nodes);
1942         }
1943
1944         /* All too small, use the largest */
1945         if (unlikely(nodes_empty(interleave_nodes)))
1946                 node_set(prefer, interleave_nodes);
1947
1948         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1949                 printk("numa_policy_init: interleaving failed\n");
1950 }
1951
/*
 * Reset policy of current process to default.
 *
 * Thin wrapper around do_set_mempolicy(MPOL_DEFAULT, 0, NULL); the return
 * value of that call is not checked.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
1957
1958 /*
1959  * Parse and format mempolicy from/to strings
1960  */
1961
1962 /*
1963  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
1964  * Used only for mpol_parse_str() and mpol_to_str()
1965  */
1966 #define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1967 static const char * const policy_types[] =
1968         { "default", "prefer", "bind", "interleave", "local" };
1969
1970
1971 #ifdef CONFIG_TMPFS
1972 /**
1973  * mpol_parse_str - parse string to mempolicy
1974  * @str:  string containing mempolicy to parse
1975  * @mpol:  pointer to struct mempolicy pointer, returned on success.
1976  * @no_context:  flag whether to "contextualize" the mempolicy
1977  *
1978  * Format of input:
1979  *      <mode>[=<flags>][:<nodelist>]
1980  *
1981  * if @no_context is true, save the input nodemask in w.user_nodemask in
1982  * the returned mempolicy.  This will be used to "clone" the mempolicy in
1983  * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
1984  * mount option.  Note that if 'static' or 'relative' mode flags were
1985  * specified, the input nodemask will already have been saved.  Saving
1986  * it again is redundant, but safe.
1987  *
1988  * On success, returns 0, else 1
1989  */
1990 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1991 {
1992         struct mempolicy *new = NULL;
1993         unsigned short uninitialized_var(mode);
1994         unsigned short uninitialized_var(mode_flags);
1995         nodemask_t nodes;
1996         char *nodelist = strchr(str, ':');
1997         char *flags = strchr(str, '=');
1998         int i;
1999         int err = 1;
2000
2001         if (nodelist) {
2002                 /* NUL-terminate mode or flags string */
2003                 *nodelist++ = '\0';
2004                 if (nodelist_parse(nodelist, nodes))
2005                         goto out;
2006                 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2007                         goto out;
2008         } else
2009                 nodes_clear(nodes);
2010
2011         if (flags)
2012                 *flags++ = '\0';        /* terminate mode string */
2013
2014         for (i = 0; i <= MPOL_LOCAL; i++) {
2015                 if (!strcmp(str, policy_types[i])) {
2016                         mode = i;
2017                         break;
2018                 }
2019         }
2020         if (i > MPOL_LOCAL)
2021                 goto out;
2022
2023         switch (mode) {
2024         case MPOL_PREFERRED:
2025                 /*
2026                  * Insist on a nodelist of one node only
2027                  */
2028                 if (nodelist) {
2029                         char *rest = nodelist;
2030                         while (isdigit(*rest))
2031                                 rest++;
2032                         if (*rest)
2033                                 goto out;
2034                 }
2035                 break;
2036         case MPOL_INTERLEAVE:
2037                 /*
2038                  * Default to online nodes with memory if no nodelist
2039                  */
2040                 if (!nodelist)
2041                         nodes = node_states[N_HIGH_MEMORY];
2042                 break;
2043         case MPOL_LOCAL:
2044                 /*
2045                  * Don't allow a nodelist;  mpol_new() checks flags
2046                  */
2047                 if (nodelist)
2048                         goto out;
2049                 mode = MPOL_PREFERRED;
2050                 break;
2051         case MPOL_DEFAULT:
2052                 /*
2053                  * Insist on a empty nodelist
2054                  */
2055                 if (!nodelist)
2056                         err = 0;
2057                 goto out;
2058         case MPOL_BIND:
2059                 /*
2060                  * Insist on a nodelist
2061                  */
2062                 if (!nodelist)
2063                         goto out;
2064         }
2065
2066         mode_flags = 0;
2067         if (flags) {
2068                 /*
2069                  * Currently, we only support two mutually exclusive
2070                  * mode flags.
2071                  */
2072                 if (!strcmp(flags, "static"))
2073                         mode_flags |= MPOL_F_STATIC_NODES;
2074                 else if (!strcmp(flags, "relative"))
2075                         mode_flags |= MPOL_F_RELATIVE_NODES;
2076                 else
2077                         goto out;
2078         }
2079
2080         new = mpol_new(mode, mode_flags, &nodes);
2081         if (IS_ERR(new))
2082                 goto out;
2083         err = 0;
2084         if (no_context) {
2085                 /* save for contextualization */
2086                 new->w.user_nodemask = nodes;
2087         }
2088
2089 out:
2090         /* Restore string for error message */
2091         if (nodelist)
2092                 *--nodelist = ':';
2093         if (flags)
2094                 *--flags = '=';
2095         if (!err)
2096                 *mpol = new;
2097         return err;
2098 }
2099 #endif /* CONFIG_TMPFS */
2100
2101 /**
2102  * mpol_to_str - format a mempolicy structure for printing
2103  * @buffer:  to contain formatted mempolicy string
2104  * @maxlen:  length of @buffer
2105  * @pol:  pointer to mempolicy to be formatted
2106  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2107  *
2108  * Convert a mempolicy into a string.
2109  * Returns the number of characters in buffer (if positive)
2110  * or an error (negative)
2111  */
2112 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2113 {
2114         char *p = buffer;
2115         int l;
2116         nodemask_t nodes;
2117         unsigned short mode;
2118         unsigned short flags = pol ? pol->flags : 0;
2119
2120         /*
2121          * Sanity check:  room for longest mode, flag and some nodes
2122          */
2123         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2124
2125         if (!pol || pol == &default_policy)
2126                 mode = MPOL_DEFAULT;
2127         else
2128                 mode = pol->mode;
2129
2130         switch (mode) {
2131         case MPOL_DEFAULT:
2132                 nodes_clear(nodes);
2133                 break;
2134
2135         case MPOL_PREFERRED:
2136                 nodes_clear(nodes);
2137                 if (flags & MPOL_F_LOCAL)
2138                         mode = MPOL_LOCAL;      /* pseudo-policy */
2139                 else
2140                         node_set(pol->v.preferred_node, nodes);
2141                 break;
2142
2143         case MPOL_BIND:
2144                 /* Fall through */
2145         case MPOL_INTERLEAVE:
2146                 if (no_context)
2147                         nodes = pol->w.user_nodemask;
2148                 else
2149                         nodes = pol->v.nodes;
2150                 break;
2151
2152         default:
2153                 BUG();
2154         }
2155
2156         l = strlen(policy_types[mode]);
2157         if (buffer + maxlen < p + l + 1)
2158                 return -ENOSPC;
2159
2160         strcpy(p, policy_types[mode]);
2161         p += l;
2162
2163         if (flags & MPOL_MODE_FLAGS) {
2164                 if (buffer + maxlen < p + 2)
2165                         return -ENOSPC;
2166                 *p++ = '=';
2167
2168                 /*
2169                  * Currently, the only defined flags are mutually exclusive
2170                  */
2171                 if (flags & MPOL_F_STATIC_NODES)
2172                         p += snprintf(p, buffer + maxlen - p, "static");
2173                 else if (flags & MPOL_F_RELATIVE_NODES)
2174                         p += snprintf(p, buffer + maxlen - p, "relative");
2175         }
2176
2177         if (!nodes_empty(nodes)) {
2178                 if (buffer + maxlen < p + 2)
2179                         return -ENOSPC;
2180                 *p++ = ':';
2181                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2182         }
2183         return p - buffer;
2184 }
2185
/*
 * Per-VMA page statistics accumulated by gather_stats() and printed by
 * show_numa_map().  All counters are in units of pages visited.
 */
struct numa_maps {
        unsigned long pages;            /* every page passed to gather_stats() */
        unsigned long anon;             /* pages for which PageAnon() is set */
        unsigned long active;           /* pages for which PageActive() is set */
        unsigned long writeback;        /* pages for which PageWriteback() is set */
        unsigned long mapcount_max;     /* largest page_mapcount() seen */
        unsigned long dirty;            /* dirty pte or PageDirty() */
        unsigned long swapcache;        /* pages for which PageSwapCache() is set */
        unsigned long node[MAX_NUMNODES];       /* pages per node, by page_to_nid() */
};
2196
2197 static void gather_stats(struct page *page, void *private, int pte_dirty)
2198 {
2199         struct numa_maps *md = private;
2200         int count = page_mapcount(page);
2201
2202         md->pages++;
2203         if (pte_dirty || PageDirty(page))
2204                 md->dirty++;
2205
2206         if (PageSwapCache(page))
2207                 md->swapcache++;
2208
2209         if (PageActive(page))
2210                 md->active++;
2211
2212         if (PageWriteback(page))
2213                 md->writeback++;
2214
2215         if (PageAnon(page))
2216                 md->anon++;
2217
2218         if (count > md->mapcount_max)
2219                 md->mapcount_max = count;
2220
2221         md->node[page_to_nid(page)]++;
2222 }
2223
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk the huge page mappings of @vma in [start, end), one huge page at a
 * time, and feed every mapped page into gather_stats() for @md.
 * Addresses with no page table entry, an empty entry or no page are
 * skipped.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long addr;
        struct page *page;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);

        for (addr = start; addr < end; addr += sz) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm,
                                                addr & huge_page_mask(h));
                pte_t pte;

                if (!ptep)
                        continue;

                /* take one snapshot of the entry and use it throughout */
                pte = *ptep;
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (!page)
                        continue;

                /*
                 * Test the dirty bit on the same snapshot the page was
                 * derived from instead of reading *ptep a second time.
                 */
                gather_stats(page, md, pte_dirty(pte));
        }
}
#else
/* No huge page support configured: nothing to gather. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
2260
2261 /*
2262  * Display pages allocated per node and memory policy via /proc.
2263  */
2264 int show_numa_map(struct seq_file *m, void *v)
2265 {
2266         struct proc_maps_private *priv = m->private;
2267         struct vm_area_struct *vma = v;
2268         struct numa_maps *md;
2269         struct file *file = vma->vm_file;
2270         struct mm_struct *mm = vma->vm_mm;
2271         struct mempolicy *pol;
2272         int n;
2273         char buffer[50];
2274
2275         if (!mm)
2276                 return 0;
2277
2278         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2279         if (!md)
2280                 return 0;
2281
2282         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2283         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2284         mpol_cond_put(pol);
2285
2286         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2287
2288         if (file) {
2289                 seq_printf(m, " file=");
2290                 seq_path(m, &file->f_path, "\n\t= ");
2291         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2292                 seq_printf(m, " heap");
2293         } else if (vma->vm_start <= mm->start_stack &&
2294                         vma->vm_end >= mm->start_stack) {
2295                 seq_printf(m, " stack");
2296         }
2297
2298         if (is_vm_hugetlb_page(vma)) {
2299                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2300                 seq_printf(m, " huge");
2301         } else {
2302                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2303                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2304         }
2305
2306         if (!md->pages)
2307                 goto out;
2308
2309         if (md->anon)
2310                 seq_printf(m," anon=%lu",md->anon);
2311
2312         if (md->dirty)
2313                 seq_printf(m," dirty=%lu",md->dirty);
2314
2315         if (md->pages != md->anon && md->pages != md->dirty)
2316                 seq_printf(m, " mapped=%lu", md->pages);
2317
2318         if (md->mapcount_max > 1)
2319                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2320
2321         if (md->swapcache)
2322                 seq_printf(m," swapcache=%lu", md->swapcache);
2323
2324         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2325                 seq_printf(m," active=%lu", md->active);
2326
2327         if (md->writeback)
2328                 seq_printf(m," writeback=%lu", md->writeback);
2329
2330         for_each_node_state(n, N_HIGH_MEMORY)
2331                 if (md->node[n])
2332                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2333 out:
2334         seq_putc(m, '\n');
2335         kfree(md);
2336
2337         if (m->count < m->size)
2338                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2339         return 0;
2340 }