/* Linux 3.16.75 - mm/mempolicy.c */
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
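/*
 * Illustrative userspace sketch (not part of this file; node numbers and
 * the mapping 'buf'/'len' are placeholder assumptions for a machine with
 * at least two memory nodes). The policies above are selected through the
 * set_mempolicy()/mbind() syscalls, e.g. via the libnuma <numaif.h>
 * wrappers:
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);
 *
 *	// interleave this task's future allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// bind an existing mapping to node 0, migrating what is movable
 *	unsigned long node0 = 1UL << 0;
 *	mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8,
 *	      MPOL_MF_MOVE | MPOL_MF_STRICT);
 */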
/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <linux/random.h>

#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;
/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
	.refcnt = ATOMIC_INIT(1), /* never free it */
	.mode = MPOL_PREFERRED,
	.flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];
static struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *pol = p->mempolicy;

	if (!pol) {
		int node = numa_node_id();

		if (node != NUMA_NO_NODE) {
			pol = &preferred_node_policy[node];
			/*
			 * preferred_node_policy is not initialised early in
			 * boot
			 */
			if (!pol->mode)
				pol = NULL;
		}
	}

	return pol;
}
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy,
	 * the write-side task will rebind task->mempolicy in two steps.
	 * The first step is setting all the newly-allowed nodes, and the
	 * second step is clearing all the disallowed nodes. This way we
	 * avoid ever finding no node from which to allocate a page.
	 * If we have a lock to protect task->mempolicy on the read side,
	 * we rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do the rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly-allowed nodes
	 *	MPOL_REBIND_STEP2 - clear all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, node_states[N_MEMORY]);
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;
	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
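/*
 * Worked example (values are illustrative): with a user-supplied relative
 * mask *orig = {0,1} and an allowed set *rel = {2,3,5}, nodes_fold() first
 * folds *orig onto nodes_weight(*rel) == 3 positions, giving {0,1}, and
 * nodes_onto() then maps ordinal positions 0 and 1 onto the set bits of
 * *rel, so *ret = {2,3}. A relative bit n thus selects the n-th allowed
 * node, wrapping modulo the number of allowed nodes.
 */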
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (nodes_empty(*nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;			/* no allowed nodes */
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!is_valid_nodemask(nodes))
		return -EINVAL;
	pol->v.nodes = *nodes;
	return 0;
}
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}
/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);
	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}
/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
	if (!atomic_dec_and_test(&p->refcnt))
		return;
	kmem_cache_free(policy_cache, p);
}
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
				enum mpol_rebind_step step)
{
}

/*
 * step:
 *	MPOL_REBIND_ONCE  - do the rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly-allowed nodes
 *	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
				 enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES)
		nodes_and(tmp, pol->w.user_nodemask, *nodes);
	else if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
	else {
		/*
		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
		 * result
		 */
		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
			nodes_remap(tmp, pol->v.nodes,
					pol->w.cpuset_mems_allowed, *nodes);
			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
		} else if (step == MPOL_REBIND_STEP2) {
			tmp = pol->w.cpuset_mems_allowed;
			pol->w.cpuset_mems_allowed = *nodes;
		} else
			BUG();
	}

	if (nodes_empty(tmp))
		tmp = *nodes;

	if (step == MPOL_REBIND_STEP1)
		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
		pol->v.nodes = tmp;
	else
		BUG();

	if (!node_isset(current->il_next, tmp)) {
		current->il_next = next_node(current->il_next, tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = first_node(tmp);
		if (current->il_next >= MAX_NUMNODES)
			current->il_next = numa_node_id();
	}
}
static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes,
				  enum mpol_rebind_step step)
{
	nodemask_t tmp;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		int node = first_node(pol->w.user_nodemask);

		if (node_isset(node, *nodes)) {
			pol->v.preferred_node = node;
			pol->flags &= ~MPOL_F_LOCAL;
		} else
			pol->flags |= MPOL_F_LOCAL;
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(tmp);
	} else if (!(pol->flags & MPOL_F_LOCAL)) {
		pol->v.preferred_node = node_remap(pol->v.preferred_node,
						   pol->w.cpuset_mems_allowed,
						   *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}
}
/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task will rebind task->mempolicy in two steps. The first
 * step is setting all the newly-allowed nodes, and the second step is
 * clearing all the disallowed nodes. This way we avoid ever finding no
 * node from which to allocate a page.
 * If we have a lock to protect task->mempolicy on the read side, we
 * rebind directly.
 *
 * step:
 *	MPOL_REBIND_ONCE  - do the rebind work at once
 *	MPOL_REBIND_STEP1 - set all the newly-allowed nodes
 *	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
				enum mpol_rebind_step step)
{
	if (!pol)
		return;
	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
		return;

	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
		return;

	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
		BUG();

	if (step == MPOL_REBIND_STEP1)
		pol->flags |= MPOL_F_REBINDING;
	else if (step == MPOL_REBIND_STEP2)
		pol->flags &= ~MPOL_F_REBINDING;
	else if (step >= MPOL_REBIND_NSTEP)
		BUG();

	mpol_ops[pol->mode].rebind(pol, newmask, step);
}
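/*
 * Worked example of the two-step rebind above (illustrative values): an
 * MPOL_INTERLEAVE policy without static/relative flags over mems_allowed
 * {0,1} being moved to {2,3}. With no read-side lock, MPOL_REBIND_STEP1
 * ORs the remapped nodes in, leaving pol->v.nodes == {0,1,2,3} (never
 * empty, so a concurrent reader always finds a node), and
 * MPOL_REBIND_STEP2 then drops the disallowed ones, leaving {2,3}. When
 * the reader is locked out, MPOL_REBIND_ONCE remaps {0,1} -> {2,3} in
 * one go instead.
 */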
/*
 * Wrapper for mpol_rebind_policy() that just requires task
 * pointer, and updates task mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
			enum mpol_rebind_step step)
{
	mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next)
		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
	up_write(&mm->mmap_sem);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags);
/*
 * Scan through pages checking if pages follow certain conditions,
 * and move them to the pagelist if they do.
 */
static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pte_t *orig_pte;
	pte_t *pte;
	spinlock_t *ptl;

	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		struct page *page;
		int nid;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;
		/*
		 * vm_normal_page() filters out zero pages, but there might
		 * still be PageReserved pages to skip, perhaps in a VDSO.
		 */
		if (PageReserved(page))
			continue;
		nid = page_to_nid(page);
		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
			continue;

		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
			migrate_page_add(page, private, flags);
		else
			break;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(orig_pte, ptl);
	return addr != end;
}
static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
		pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
		void *private)
{
#ifdef CONFIG_HUGETLB_PAGE
	int nid;
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
	entry = huge_ptep_get((pte_t *)pmd);
	if (!pte_present(entry))
		goto unlock;
	page = pte_page(entry);
	nid = page_to_nid(page);
	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
		goto unlock;
	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
	if (flags & (MPOL_MF_MOVE_ALL) ||
	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
		isolate_huge_page(page, private);
unlock:
	spin_unlock(ptl);
#else
	BUG();
#endif
}
static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_present(*pmd))
			continue;
		if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
			queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
						flags, private);
			continue;
		}
		split_huge_page_pmd(vma, addr, pmd);
		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
			continue;
		if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
					  flags, private))
			return -EIO;
	} while (pmd++, addr = next, addr != end);
	return 0;
}
static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
					  flags, private))
			return -EIO;
	} while (pud++, addr = next, addr != end);
	return 0;
}
static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end,
		const nodemask_t *nodes, unsigned long flags,
		void *private)
{
	pgd_t *pgd;
	unsigned long next;

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
					  flags, private))
			return -EIO;
	} while (pgd++, addr = next, addr != end);
	return 0;
}
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	int nr_updated;

	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
	if (nr_updated)
		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

	return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on a set of nodes (determined by
 * @nodes and @flags), they are isolated and queued to the pagelist
 * passed via @private.
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		const nodemask_t *nodes, unsigned long flags, void *private)
{
	int err = 0;
	struct vm_area_struct *vma, *prev;

	vma = find_vma(mm, start);
	if (!vma)
		return -EFAULT;
	prev = NULL;
	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
		unsigned long endvma = vma->vm_end;

		if (endvma > end)
			endvma = end;
		if (vma->vm_start > start)
			start = vma->vm_start;

		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
			if (!vma->vm_next && vma->vm_end < end)
				return -EFAULT;
			if (prev && prev->vm_end < vma->vm_start)
				return -EFAULT;
		}

		if (flags & MPOL_MF_LAZY) {
			change_prot_numa(vma, start, endvma);
			goto next;
		}

		if ((flags & MPOL_MF_STRICT) ||
		    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
		     vma_migratable(vma))) {

			err = queue_pages_pgd_range(vma, start, endvma, nodes,
						flags, private);
			if (err)
				break;
		}
next:
		prev = vma;
	}
	return err;
}
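/*
 * Note on MPOL_MF_INVERT (taken from the do_mbind() caller below): to
 * migrate pages *onto* a new nodemask, the walk must queue exactly the
 * pages that are NOT yet on an allowed node, so do_mbind() inverts the
 * nodemask check:
 *
 *	err = queue_pages_range(mm, start, end, nmask,
 *				flags | MPOL_MF_INVERT, &pagelist);
 */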
/*
 * Apply policy to a single VMA
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
				struct mempolicy *pol)
{
	int err;
	struct mempolicy *old;
	struct mempolicy *new;

	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
		 vma->vm_ops, vma->vm_file,
		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);

	new = mpol_dup(pol);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (vma->vm_ops && vma->vm_ops->set_policy) {
		err = vma->vm_ops->set_policy(vma, new);
		if (err)
			goto err_out;
	}

	old = vma->vm_policy;
	vma->vm_policy = new; /* protected by mmap_sem */
	mpol_put(old);

	return 0;
err_out:
	mpol_put(new);
	return err;
}
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
		       unsigned long end, struct mempolicy *new_pol)
{
	struct vm_area_struct *next;
	struct vm_area_struct *prev;
	struct vm_area_struct *vma;
	int err = 0;
	pgoff_t pgoff;
	unsigned long vmstart;
	unsigned long vmend;

	vma = find_vma(mm, start);
	if (!vma || vma->vm_start > start)
		return -EFAULT;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
		next = vma->vm_next;
		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);

		if (mpol_equal(vma_policy(vma), new_pol))
			continue;

		pgoff = vma->vm_pgoff +
			((vmstart - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 new_pol);
		if (prev) {
			vma = prev;
			next = vma->vm_next;
			if (mpol_equal(vma_policy(vma), new_pol))
				continue;
			/* vma_merge() joined vma && vma->next, case 8 */
			goto replace;
		}
		if (vma->vm_start != vmstart) {
			err = split_vma(vma->vm_mm, vma, vmstart, 1);
			if (err)
				goto out;
		}
		if (vma->vm_end != vmend) {
			err = split_vma(vma->vm_mm, vma, vmend, 0);
			if (err)
				goto out;
		}
replace:
		err = vma_replace_policy(vma, new_pol);
		if (err)
			goto out;
	}

out:
	return err;
}
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	struct mm_struct *mm = current->mm;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}
	/*
	 * prevent changing our mempolicy while show_numa_maps()
	 * is using it.
	 * Note:  do_set_mempolicy() can be called at init time
	 * with no 'mm'.
	 */
	if (mm)
		down_write(&mm->mmap_sem);
	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		if (mm)
			up_write(&mm->mmap_sem);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	if (mm)
		up_write(&mm->mmap_sem);

	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	case MPOL_PREFERRED:
		if (!(p->flags & MPOL_F_LOCAL))
			node_set(p->v.preferred_node, *nodes);
		/* else return empty node mask for local allocation */
		break;
	default:
		BUG();
	}
}
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
	struct page *p;
	int err;

	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
	if (err >= 0) {
		err = page_to_nid(p);
		put_page(p);
	}
	return err;
}
/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
			     unsigned long addr, unsigned long flags)
{
	int err;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = NULL;
	struct mempolicy *pol = current->mempolicy;

	if (flags &
		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
		return -EINVAL;

	if (flags & MPOL_F_MEMS_ALLOWED) {
		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
			return -EINVAL;
		*policy = 0;	/* just so it's initialized */
		task_lock(current);
		*nmask = cpuset_current_mems_allowed;
		task_unlock(current);
		return 0;
	}

	if (flags & MPOL_F_ADDR) {
		/*
		 * Do NOT fall back to task policy if the
		 * vma/shared policy at addr is NULL.  We
		 * want to return MPOL_DEFAULT in this case.
		 */
		down_read(&mm->mmap_sem);
		vma = find_vma_intersection(mm, addr, addr+1);
		if (!vma) {
			up_read(&mm->mmap_sem);
			return -EFAULT;
		}
		if (vma->vm_ops && vma->vm_ops->get_policy)
			pol = vma->vm_ops->get_policy(vma, addr);
		else
			pol = vma->vm_policy;
	} else if (addr)
		return -EINVAL;

	if (!pol)
		pol = &default_policy;	/* indicates default behavior */

	if (flags & MPOL_F_NODE) {
		if (flags & MPOL_F_ADDR) {
			err = lookup_node(mm, addr);
			if (err < 0)
				goto out;
			*policy = err;
		} else if (pol == current->mempolicy &&
				pol->mode == MPOL_INTERLEAVE) {
			*policy = current->il_next;
		} else {
			err = -EINVAL;
			goto out;
		}
	} else {
		*policy = pol == &default_policy ? MPOL_DEFAULT :
						pol->mode;
		/*
		 * Internal mempolicy flags must be masked off before exposing
		 * the policy to userspace.
		 */
		*policy |= (pol->flags & MPOL_MODE_FLAGS);
	}

	err = 0;
	if (nmask) {
		if (mpol_store_user_nodemask(pol)) {
			*nmask = pol->w.user_nodemask;
		} else {
			task_lock(current);
			get_policy_nodemask(pol, nmask);
			task_unlock(current);
		}
	}

out:
	mpol_cond_put(pol);
	if (vma)
		up_read(&current->mm->mmap_sem);
	return err;
}
#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
		if (!isolate_lru_page(page)) {
			list_add_tail(&page->lru, pagelist);
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));
		}
	}
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					node);
	else
		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t nmask;
	LIST_HEAD(pagelist);
	int err = 0;

	nodes_clear(nmask);
	node_set(source, nmask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration.  Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
					MIGRATE_SYNC, MR_SYSCALL);
		if (err)
			putback_movable_pages(&pagelist);
	}

	return err;
}
/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	int busy = 0;
	int err;
	nodemask_t tmp;

	err = migrate_prep();
	if (err)
		return err;

	down_read(&mm->mmap_sem);

	err = migrate_vmas(mm, from, to, flags);
	if (err)
		goto out;

	/*
	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
	 * bit in 'tmp', and return that <source, dest> pair for migration.
	 * The pair of nodemasks 'to' and 'from' define the map.
	 *
	 * If no pair of bits is found that way, fallback to picking some
	 * pair of 'source' and 'dest' bits that are not the same.  If the
	 * 'source' and 'dest' bits are the same, this represents a node
	 * that will be migrating to itself, so no pages need move.
	 *
	 * If no bits are left in 'tmp', or if all remaining bits left
	 * in 'tmp' correspond to the same bit in 'to', return false
	 * (nothing left to migrate).
	 *
	 * This lets us pick a pair of nodes to migrate between, such that
	 * if possible the dest node is not already occupied by some other
	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating outgoing memory source from that same node.
	 *
	 * A single scan of tmp is sufficient.  As we go, we remember the
	 * most recent <s, d> pair that moved (s != d).  If we find a pair
	 * that not only moved, but what's better, moved to an empty slot
	 * (d is not set in tmp), then we break out then, with that pair.
	 * Otherwise when we finish scanning tmp, we at least have the
	 * most recent <s, d> pair that moved.  If we get all the way through
	 * the scan of tmp without finding any node that moved, much less
	 * moved to an empty node, then there is nothing left worth migrating.
	 */

	tmp = *from;
	while (!nodes_empty(tmp)) {
		int s, d;
		int source = NUMA_NO_NODE;
		int dest = 0;

		for_each_node_mask(s, tmp) {

			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However if the number of source nodes is not equal to
			 * the number of destination nodes we can not preserve
			 * this node relative relationship.  In that case, skip
			 * copying memory from a node that is in the destination
			 * mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
			 */

			if ((nodes_weight(*from) != nodes_weight(*to)) &&
						(node_isset(s, *to)))
				continue;

			d = node_remap(s, *from, *to);
			if (s == d)
				continue;

			source = s;	/* Node moved. Memorize */
			dest = d;

			/* dest not in remaining from nodes? */
			if (!node_isset(dest, tmp))
				break;
		}
		if (source == NUMA_NO_NODE)
			break;

		node_clear(source, tmp);
		err = migrate_to_node(mm, source, dest, flags);
		if (err > 0)
			busy += err;
		if (err < 0)
			break;
	}
out:
	up_read(&mm->mmap_sem);
	if (err < 0)
		return err;
	return busy;
}
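/*
 * Worked trace of the pair-selection loop above (illustrative values):
 * from = {0,1}, to = {1,2}. First pass over tmp = {0,1}: s=0 maps to d=1,
 * but node 1 is still in tmp (it has not been vacated), so the scan
 * continues; s=1 maps to d=2, which is not in tmp, so we break and
 * migrate 1 -> 2 first. Second pass over tmp = {0}: 0 -> 1 now moves into
 * the freshly vacated node. Migrating in this order avoids piling
 * incoming pages onto node 1 before its own pages have left.
 */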
/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not.  N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_page(struct page *page, unsigned long start, int **x)
{
	struct vm_area_struct *vma;
	unsigned long uninitialized_var(address);

	vma = find_vma(current->mm, start);
	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		BUG_ON(!vma);
		return alloc_huge_page_noerr(vma, address, 1);
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
				unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
		     const nodemask_t *to, int flags)
{
	return -ENOSYS;
}

static struct page *new_page(struct page *page, unsigned long start, int **x)
{
	return NULL;
}
#endif
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	if (flags & MPOL_MF_LAZY)
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			down_write(&mm->mmap_sem);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				up_write(&mm->mmap_sem);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	err = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);
	if (!err)
		err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		if (nr_failed && (flags & MPOL_MF_STRICT))
			err = -EIO;
	} else
		putback_movable_pages(&pagelist);

	up_write(&mm->mmap_sem);
mpol_out:
	mpol_put(new);
	return err;
}
/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
		     unsigned long maxnode)
{
	unsigned long k;
	unsigned long nlongs;
	unsigned long endmask;

	--maxnode;
	nodes_clear(*nodes);
	if (maxnode == 0 || !nmask)
		return 0;
	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
		return -EINVAL;

	nlongs = BITS_TO_LONGS(maxnode);
	if ((maxnode % BITS_PER_LONG) == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/* When the user specified more nodes than supported just check
	   if the non supported part is all zero. */
	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
		if (nlongs > PAGE_SIZE/sizeof(long))
			return -EINVAL;
		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
			unsigned long t;
			if (get_user(t, nmask + k))
				return -EFAULT;
			if (k == nlongs - 1) {
				if (t & endmask)
					return -EINVAL;
			} else if (t)
				return -EINVAL;
		}
		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
		endmask = ~0UL;
	}

	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
		return -EFAULT;
	nodes_addr(*nodes)[nlongs-1] &= endmask;
	return 0;
}
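/*
 * Worked example (illustrative): a caller passing maxnode == 5 means the
 * user mask describes nodes 0-3. After --maxnode (4 bits), nlongs == 1
 * and endmask == (1UL << 4) - 1 == 0xf, so any bit set above node 3 in
 * the copied word is silently cleared by the final '&= endmask'.
 */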
/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

	if (copy > nbytes) {
		if (copy > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
			return -EFAULT;
		copy = nbytes;
	}
	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
		unsigned long, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode, unsigned, flags)
{
	nodemask_t nodes;
	int err;
	unsigned short mode_flags;

	mode_flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if (mode >= MPOL_MAX)
		return -EINVAL;
	if ((mode_flags & MPOL_F_STATIC_NODES) &&
	    (mode_flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
/* Set the process memory policy */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
		unsigned long, maxnode)
{
	int err;
	nodemask_t nodes;
	unsigned short flags;

	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;
	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;
	return do_set_mempolicy(mode, flags, &nodes);
}
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
		const unsigned long __user *, old_nodes,
		const unsigned long __user *, new_nodes)
{
	const struct cred *cred = current_cred(), *tcred;
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	nodemask_t task_nodes;
	int err;
	nodemask_t *old;
	nodemask_t *new;
	NODEMASK_SCRATCH(scratch);

	if (!scratch)
		return -ENOMEM;

	old = &scratch->mask1;
	new = &scratch->mask2;

	err = get_nodes(old, old_nodes, maxnode);
	if (err)
		goto out;

	err = get_nodes(new, new_nodes, maxnode);
	if (err)
		goto out;

	/* Find the mm_struct */
	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		err = -ESRCH;
		goto out;
	}
	get_task_struct(task);

	err = -EINVAL;

	/*
	 * Check if this process has the right to modify the specified
	 * process. The right exists if the process has administrative
	 * capabilities, superuser privileges or the same
	 * userid as the target process.
	 */
	tcred = __task_cred(task);
	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
	    !capable(CAP_SYS_NICE)) {
		rcu_read_unlock();
		err = -EPERM;
		goto out_put;
	}
	rcu_read_unlock();

	task_nodes = cpuset_mems_allowed(task);
	/* Is the user allowed to access the target nodes? */
	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out_put;
	}

	if (!nodes_subset(*new, node_states[N_MEMORY])) {
		err = -EINVAL;
		goto out_put;
	}

	err = security_task_movememory(task);
	if (err)
		goto out_put;

	mm = get_task_mm(task);
	put_task_struct(task);

	if (!mm) {
		err = -EINVAL;
		goto out;
	}

	err = do_migrate_pages(mm, old, new,
		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

	mmput(mm);
out:
	NODEMASK_SCRATCH_FREE(scratch);

	return err;

out_put:
	put_task_struct(task);
	goto out;
}
/* Retrieve NUMA policy */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		unsigned long __user *, nmask, unsigned long, maxnode,
		unsigned long, addr, unsigned long, flags)
{
	int err;
	int uninitialized_var(pval);
	nodemask_t nodes;

	if (nmask != NULL && maxnode < MAX_NUMNODES)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);

	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (nmask)
		err = copy_nodes_to_user(nmask, maxnode, &nodes);

	return err;
}
#ifdef CONFIG_COMPAT

COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		       compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode,
		       compat_ulong_t, addr, compat_ulong_t, flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;
		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
		err = copy_from_user(bm, nm, copy_size);
		/* ensure entire bitmap is zeroed */
		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
		err |= compat_put_bitmap(nmask, bm, nr_bits);
	}

	return err;
}

COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode)
{
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		if (compat_get_bitmap(bm, nmask, nr_bits))
			return -EFAULT;
		nm = compat_alloc_user_space(alloc_size);
		if (copy_to_user(nm, bm, alloc_size))
			return -EFAULT;
	}

	return sys_set_mempolicy(mode, nm, nr_bits+1);
}

COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	nodemask_t bm;

	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask) {
		if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
			return -EFAULT;
		nm = compat_alloc_user_space(alloc_size);
		if (copy_to_user(nm, nodes_addr(bm), alloc_size))
			return -EFAULT;
	}

	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif
/*
 * get_vma_policy(@task, @vma, @addr)
 * @task: task for fallback if vma policy == default
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies must be
 * protected by task_lock(task) by the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct task_struct *task,
		struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol = get_task_policy(task);

	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
									addr);
			if (vpol)
				pol = vpol;
		} else if (vma->vm_policy) {
			pol = vma->vm_policy;

			/*
			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
			 * count on these policies which will be dropped by
			 * mpol_cond_put() later
			 */
			if (mpol_needs_cond_ref(pol))
				mpol_get(pol);
		}
	}
	if (!pol)
		pol = &default_policy;
	return pol;
}
bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
{
	struct mempolicy *pol = get_task_policy(task);
	if (vma) {
		if (vma->vm_ops && vma->vm_ops->get_policy) {
			bool ret = false;

			pol = vma->vm_ops->get_policy(vma, vma->vm_start);
			if (pol && (pol->flags & MPOL_F_MOF))
				ret = true;
			mpol_cond_put(pol);

			return ret;
		} else if (vma->vm_policy) {
			pol = vma->vm_policy;
		}
	}

	if (!pol)
		return default_policy.flags & MPOL_F_MOF;

	return pol->flags & MPOL_F_MOF;
}
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
	enum zone_type dynamic_policy_zone = policy_zone;

	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

	/*
	 * if policy->v.nodes has movable memory only,
	 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
	 *
	 * policy->v.nodes is intersected with node_states[N_MEMORY],
	 * so if the following test fails, it implies
	 * policy->v.nodes has movable memory only.
	 */
	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
		dynamic_policy_zone = ZONE_MOVABLE;

	return zone >= dynamic_policy_zone;
}
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (unlikely(policy->mode == MPOL_BIND) &&
			apply_policy_zone(policy, gfp_zone(gfp)) &&
			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return &policy->v.nodes;

	return NULL;
}
/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
	int nd)
{
	switch (policy->mode) {
	case MPOL_PREFERRED:
		if (!(policy->flags & MPOL_F_LOCAL))
			nd = policy->v.preferred_node;
		break;
	case MPOL_BIND:
		/*
		 * Normally, MPOL_BIND allocations are node-local within the
		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
		 * current node isn't part of the mask, we use the zonelist for
		 * the first node in the mask instead.
		 */
		if (unlikely(gfp & __GFP_THISNODE) &&
				unlikely(!node_isset(nd, policy->v.nodes)))
			nd = first_node(policy->v.nodes);
		break;
	default:
		BUG();
	}
	return node_zonelist(nd, gfp);
}
/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	unsigned nid, next;
	struct task_struct *me = current;

	nid = me->il_next;
	next = next_node(nid, policy->v.nodes);
	if (next >= MAX_NUMNODES)
		next = first_node(policy->v.nodes);
	if (next < MAX_NUMNODES)
		me->il_next = next;
	return nid;
}
/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 */
unsigned int mempolicy_slab_node(void)
{
	struct mempolicy *policy;
	int node = numa_mem_id();

	if (in_interrupt())
		return node;

	policy = current->mempolicy;
	if (!policy || policy->flags & MPOL_F_LOCAL)
		return node;

	switch (policy->mode) {
	case MPOL_PREFERRED:
		/*
		 * handled MPOL_F_LOCAL above
		 */
		return policy->v.preferred_node;

	case MPOL_INTERLEAVE:
		return interleave_nodes(policy);

	case MPOL_BIND: {
		/*
		 * Follow bind policy behavior and start allocation at the
		 * first node.
		 */
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		zonelist = &NODE_DATA(node)->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
							&policy->v.nodes,
							&zone);
		return zone ? zone->node : node;
	}

	default:
		BUG();
	}
}
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
		struct vm_area_struct *vma, unsigned long off)
{
	unsigned nnodes = nodes_weight(pol->v.nodes);
	unsigned target;
	int c;
	int nid = NUMA_NO_NODE;

	if (!nnodes)
		return numa_node_id();
	target = (unsigned int)off % nnodes;
	c = 0;
	do {
		nid = next_node(nid, pol->v.nodes);
		c++;
	} while (c <= target);
	return nid;
}
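/*
 * Worked example (illustrative): pol->v.nodes = {0,2,5} (nnodes == 3) and
 * off == 7 give target = 7 % 3 = 1, and the do/while walks to the second
 * set bit, returning node 2. The mapping from page offset to node is thus
 * a pure function of the nodemask, which is what makes VMA interleaving
 * stable and reproducible across faults.
 */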
/* Determine a node number for interleave */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	if (vma) {
		unsigned long off;

		/*
		 * for small pages, there is no difference between
		 * shift and PAGE_SHIFT, so the bit-shift is safe.
		 * for huge pages, since vm_pgoff is in units of small
		 * pages, we need to shift off the always 0 bits to get
		 * a useful offset.
		 */
		BUG_ON(shift < PAGE_SHIFT);
		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
		off += (addr - vma->vm_start) >> shift;
		return offset_il_node(pol, vma, off);
	} else
		return interleave_nodes(pol);
}
/*
 * Return the bit number of a random bit set in the nodemask.
 * (returns NUMA_NO_NODE if nodemask is empty)
 */
int node_random(const nodemask_t *maskp)
{
	int w, bit = NUMA_NO_NODE;

	w = nodes_weight(*maskp);
	if (w)
		bit = bitmap_ord_to_pos(maskp->bits,
			get_random_int() % w, MAX_NUMNODES);
	return bit;
}
#ifdef CONFIG_HUGETLBFS
/*
 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a zonelist suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'bind', returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist.
 *
 * Must be protected by read_mems_allowed_begin()
 */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
				gfp_t gfp_flags, struct mempolicy **mpol,
				nodemask_t **nodemask)
{
	struct zonelist *zl;

	*mpol = get_vma_policy(current, vma, addr);
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
				huge_page_shift(hstate_vma(vma))), gfp_flags);
	} else {
		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
		if ((*mpol)->mode == MPOL_BIND)
			*nodemask = &(*mpol)->v.nodes;
	}
	return zl;
}
/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	int nid;

	if (!(mask && current->mempolicy))
		return false;

	task_lock(current);
	mempolicy = current->mempolicy;
	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		if (mempolicy->flags & MPOL_F_LOCAL)
			nid = numa_node_id();
		else
			nid = mempolicy->v.preferred_node;
		init_nodemask_of_node(mask, nid);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*mask = mempolicy->v.nodes;
		break;

	default:
		BUG();
	}
	task_unlock(current);

	return true;
}
#endif
/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
 * policy, always return true since it may allocate elsewhere on fallback.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
					const nodemask_t *mask)
{
	struct mempolicy *mempolicy;
	bool ret = true;

	if (!mask)
		return ret;
	task_lock(tsk);
	mempolicy = tsk->mempolicy;
	if (!mempolicy)
		goto out;

	switch (mempolicy->mode) {
	case MPOL_PREFERRED:
		/*
		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred
		 * nodes to allocate from; the task may fall back to other
		 * nodes when oom.  Thus, it's possible for tsk to have
		 * allocated memory from nodes in mask.
		 */
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		ret = nodes_intersects(mempolicy->v.nodes, *mask);
		break;
	default:
		BUG();
	}
out:
	task_unlock(tsk);
	return ret;
}
/* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					unsigned nid)
{
	struct zonelist *zl;
	struct page *page;

	zl = node_zonelist(nid, gfp);
	page = __alloc_pages(gfp, order, zl);
	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
	return page;
}
/**
 * alloc_pages_vma - Allocate a page for a VMA.
 *
 * @gfp:
 *	%GFP_USER     user allocation.
 *	%GFP_KERNEL   kernel allocations,
 *	%GFP_HIGHMEM  highmem/user allocations,
 *	%GFP_FS       allocation should not call back into a file system.
 *	%GFP_ATOMIC   don't sleep.
 *
 * @order: Order of the GFP allocation.
 * @vma: Pointer to VMA or NULL if not available.
 * @addr: Virtual address of the allocation. Must be inside the VMA.
 * @node: Node to prefer for the allocation, subject to the policy.
 *
 * This function allocates a page from the kernel page pool and applies
 * a NUMA policy associated with the VMA or the current process.
 * When VMA is not NULL the caller must hold down_read on the mmap_sem of
 * the mm_struct of the VMA to prevent it from going away.  Should be used
 * for all allocations for pages that will be mapped into user space.
 * Returns NULL when no page can be allocated.
 *
 * Should be called with the mmap_sem of the vma held.
 */
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
		unsigned long addr, int node)
{
	struct mempolicy *pol;
	struct page *page;
	unsigned int cpuset_mems_cookie;

retry_cpuset:
	pol = get_vma_policy(current, vma, addr);
	cpuset_mems_cookie = read_mems_allowed_begin();

	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
		unsigned nid;

		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
		mpol_cond_put(pol);
		page = alloc_page_interleave(gfp, order, nid);
		if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
			goto retry_cpuset;

		return page;
	}
	page = __alloc_pages_nodemask(gfp, order,
				      policy_zonelist(gfp, pol, node),
				      policy_nodemask(gfp, pol));
	if (unlikely(mpol_needs_cond_ref(pol)))
		__mpol_put(pol);
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	return page;
}
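/*
 * The retry_cpuset cookie above follows the read_mems_allowed_begin()/
 * read_mems_allowed_retry() protocol: retry an allocation that raced with
 * a concurrent cpuset mems_allowed rebind. A minimal sketch of the
 * pattern, assuming a hypothetical try_alloc() step:
 *
 *	unsigned int cookie;
 *	struct page *page;
 *
 *	do {
 *		cookie = read_mems_allowed_begin();
 *		page = try_alloc();	// may race with a cpuset rebind
 *	} while (!page && read_mems_allowed_retry(cookie));
 */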
2048 * alloc_pages_current - Allocate pages.
2050 * @gfp:
2051 * %GFP_USER user allocation,
2052 * %GFP_KERNEL kernel allocation,
2053 * %GFP_HIGHMEM highmem allocation,
2054 * %GFP_FS don't call back into a file system.
2055 * %GFP_ATOMIC don't sleep.
2056 * @order: Power of two of allocation size in pages. 0 is a single page.
2058 * Allocate a page from the kernel page pool. When not in
2059 * interrupt context and apply the current process NUMA policy.
2060 * Returns NULL when no page can be allocated.
2062 * Don't call cpuset_update_task_memory_state() unless
2063 * 1) it's ok to take cpuset_sem (can WAIT), and
2064 * 2) allocating for current task (not interrupt).
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = get_task_policy(current);
	struct page *page;
	unsigned int cpuset_mems_cookie;

	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
		pol = &default_policy;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
	else
		page = __alloc_pages_nodemask(gfp, order,
				policy_zonelist(gfp, pol, numa_node_id()),
				policy_nodemask(gfp, pol));

	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;

	return page;
}
EXPORT_SYMBOL(alloc_pages_current);
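
/*
 * Example (illustrative): on NUMA kernels alloc_pages() resolves to
 * alloc_pages_current(), so an ordinary allocation such as
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	(four pages)
 *
 * already honours the calling task's mempolicy, e.g. one set with
 * set_mempolicy(MPOL_INTERLEAVE, ...).
 */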
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *pol = mpol_dup(vma_policy(src));

	if (IS_ERR(pol))
		return PTR_ERR(pol);
	dst->vm_policy = pol;
	return 0;
}
/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that
 * changes the cpuset's mems), so we needn't do rebind work for the
 * current task.
 */
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);

	if (!new)
		return ERR_PTR(-ENOMEM);

	/* task's mempolicy is protected by alloc_lock */
	if (old == current->mempolicy) {
		task_lock(current);
		*new = *old;
		task_unlock(current);
	} else
		*new = *old;

	if (current_cpuset_is_being_rebound()) {
		nodemask_t mems = cpuset_mems_allowed(current);
		if (new->flags & MPOL_F_REBINDING)
			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
		else
			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
	}
	atomic_set(&new->refcnt, 1);
	return new;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b)
		return false;
	if (a->mode != b->mode)
		return false;
	if (a->flags != b->flags)
		return false;
	if (mpol_store_user_nodemask(a))
		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
			return false;

	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return !!nodes_equal(a->v.nodes, b->v.nodes);
	case MPOL_PREFERRED:
		/* a's ->flags is the same as b's */
		if (a->flags & MPOL_F_LOCAL)
			return true;
		return a->v.preferred_node == b->v.preferred_node;
	default:
		BUG();
		return false;
	}
}
/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a Red-Black tree linked from the inode.
 * They are protected by the sp->lock spinlock, which should be held
 * for any accesses to the tree.
 */
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *n = sp->root.rb_node;

	while (n) {
		struct sp_node *p = rb_entry(n, struct sp_node, nd);

		if (start >= p->end)
			n = n->rb_right;
		else if (end <= p->start)
			n = n->rb_left;
		else
			break;
	}
	if (!n)
		return NULL;
	for (;;) {
		struct sp_node *w = NULL;
		struct rb_node *prev = rb_prev(n);
		if (!prev)
			break;
		w = rb_entry(prev, struct sp_node, nd);
		if (w->end <= start)
			break;
		n = prev;
	}
	return rb_entry(n, struct sp_node, nd);
}
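
/*
 * Worked example (hypothetical tree contents): with ranges [0,4) and
 * [6,10) stored, sp_lookup(sp, 3, 8) may break out of the descent at
 * either node, but the backward walk above always lands on [0,4): its
 * end (4) is still greater than start (3), so it is the lowest-starting
 * node that intersects [3,8), and that is what gets returned.
 */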
/* Insert a new shared policy into the tree. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **p = &sp->root.rb_node;
	struct rb_node *parent = NULL;
	struct sp_node *nd;

	while (*p) {
		parent = *p;
		nd = rb_entry(parent, struct sp_node, nd);
		if (new->start < nd->start)
			p = &(*p)->rb_left;
		else if (new->end > nd->end)
			p = &(*p)->rb_right;
		else
			BUG();
	}
	rb_link_node(&new->nd, parent, p);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *pol = NULL;
	struct sp_node *sn;

	if (!sp->root.rb_node)
		return NULL;
	spin_lock(&sp->lock);
	sn = sp_lookup(sp, idx, idx+1);
	if (sn) {
		mpol_get(sn->policy);
		pol = sn->policy;
	}
	spin_unlock(&sp->lock);
	return pol;
}
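
/*
 * Usage sketch (illustrative): a shared-memory fault path maps a file
 * index to its policy and drops the reference when done; shared
 * policies carry MPOL_F_SHARED, so mpol_cond_put() does the put:
 *
 *	struct mempolicy *pol = mpol_shared_policy_lookup(sp, index);
 *	... allocate the page under pol ...
 *	mpol_cond_put(pol);
 */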
static void sp_free(struct sp_node *n)
{
	mpol_put(n->policy);
	kmem_cache_free(sn_cache, n);
}
/**
 * mpol_misplaced - check whether current page node is valid in policy
 *
 * @page: page to be checked
 * @vma: vm area where page mapped
 * @addr: virtual address where page mapped
 *
 * Lookup current policy node id for vma,addr and "compare to" page's
 * node id.
 *
 * Returns:
 *	-1	- not misplaced, page is in the right node
 *	node	- node id where the page should be
 *
 * Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 */
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *pol;
	struct zone *zone;
	int curnid = page_to_nid(page);
	unsigned long pgoff;
	int thiscpu = raw_smp_processor_id();
	int thisnid = cpu_to_node(thiscpu);
	int polnid = -1;
	int ret = -1;

	BUG_ON(!vma);

	pol = get_vma_policy(current, vma, addr);
	if (!(pol->flags & MPOL_F_MOF))
		goto out;

	switch (pol->mode) {
	case MPOL_INTERLEAVE:
		BUG_ON(addr >= vma->vm_end);
		BUG_ON(addr < vma->vm_start);

		pgoff = vma->vm_pgoff;
		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
		polnid = offset_il_node(pol, vma, pgoff);
		break;

	case MPOL_PREFERRED:
		if (pol->flags & MPOL_F_LOCAL)
			polnid = numa_node_id();
		else
			polnid = pol->v.preferred_node;
		break;

	case MPOL_BIND:
		/*
		 * allows binding to multiple nodes.
		 * use current page if in policy nodemask,
		 * else select nearest allowed node, if any.
		 * If no allowed nodes, use current [!misplaced].
		 */
		if (node_isset(curnid, pol->v.nodes))
			goto out;
		(void)first_zones_zonelist(
				node_zonelist(numa_node_id(), GFP_HIGHUSER),
				gfp_zone(GFP_HIGHUSER),
				&pol->v.nodes, &zone);
		polnid = zone->node;
		break;

	default:
		BUG();
	}

	/* Migrate the page towards the node whose CPU is referencing it */
	if (pol->flags & MPOL_F_MORON) {
		polnid = thisnid;

		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
			goto out;
	}

	if (curnid != polnid)
		ret = polnid;
out:
	mpol_cond_put(pol);

	return ret;
}
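
/*
 * Usage sketch (illustrative): a NUMA hinting fault handler asks
 * whether the faulting page should move and migrates it if so:
 *
 *	int target = mpol_misplaced(page, vma, addr);
 *	if (target != -1)
 *		migrate_misplaced_page(page, vma, target);
 */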
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
static void sp_node_init(struct sp_node *node, unsigned long start,
			unsigned long end, struct mempolicy *pol)
{
	node->start = start;
	node->end = end;
	node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *n;
	struct mempolicy *newpol;

	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n)
		return NULL;

	newpol = mpol_dup(pol);
	if (IS_ERR(newpol)) {
		kmem_cache_free(sn_cache, n);
		return NULL;
	}
	newpol->flags |= MPOL_F_SHARED;
	sp_node_init(n, start, end, newpol);

	return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				 unsigned long end, struct sp_node *new)
{
	struct sp_node *n;
	struct sp_node *n_new = NULL;
	struct mempolicy *mpol_new = NULL;
	int ret = 0;

restart:
	spin_lock(&sp->lock);
	n = sp_lookup(sp, start, end);
	/* Take care of old policies in the same range. */
	while (n && n->start < end) {
		struct rb_node *next = rb_next(&n->nd);
		if (n->start >= start) {
			if (n->end <= end)
				sp_delete(sp, n);
			else
				n->start = end;
		} else {
			/* Old policy spanning whole new range. */
			if (n->end > end) {
				if (!n_new)
					goto alloc_new;

				*mpol_new = *n->policy;
				atomic_set(&mpol_new->refcnt, 1);
				sp_node_init(n_new, end, n->end, mpol_new);
				n->end = start;
				sp_insert(sp, n_new);
				n_new = NULL;
				mpol_new = NULL;
				break;
			} else
				n->end = start;
		}
		if (!next)
			break;
		n = rb_entry(next, struct sp_node, nd);
	}
	if (new)
		sp_insert(sp, new);
	spin_unlock(&sp->lock);
	ret = 0;

err_out:
	if (mpol_new)
		mpol_put(mpol_new);
	if (n_new)
		kmem_cache_free(sn_cache, n_new);

	return ret;

alloc_new:
	spin_unlock(&sp->lock);
	ret = -ENOMEM;
	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	if (!n_new)
		goto err_out;
	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!mpol_new)
		goto err_out;
	goto restart;
}
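
/*
 * Worked example (hypothetical contents): replacing [2,3) while the
 * tree holds a single node [0,8) takes the "old policy spanning whole
 * new range" branch above: [0,8) is trimmed to [0,2), the preallocated
 * n_new takes over [3,8) with a copy of the old policy, and the new
 * node [2,3) is then inserted between them.
 */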
/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol: struct mempolicy to install
 *
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called from get_inode() calls, so GFP_KERNEL can be used.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
	int ret;

	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
	spin_lock_init(&sp->lock);

	if (mpol) {
		struct vm_area_struct pvma;
		struct mempolicy *new;
		NODEMASK_SCRATCH(scratch);

		if (!scratch)
			goto put_mpol;
		/* contextualize the tmpfs mount point mempolicy */
		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
		if (IS_ERR(new))
			goto free_scratch; /* no valid nodemask intersection */

		task_lock(current);
		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
		task_unlock(current);
		if (ret)
			goto put_new;

		/* Create pseudo-vma that contains just the policy */
		memset(&pvma, 0, sizeof(struct vm_area_struct));
		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */

put_new:
		mpol_put(new);			/* drop initial ref */
free_scratch:
		NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
	}
}
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	int err;
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}
	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (err && new)
		sp_free(new);
	return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct sp_node *n;
	struct rb_node *next;

	if (!p->root.rb_node)
		return;
	spin_lock(&p->lock);
	next = rb_first(&p->root);
	while (next) {
		n = rb_entry(next, struct sp_node, nd);
		next = rb_next(&n->nd);
		sp_delete(p, n);
	}
	spin_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. "
			"Configure with numa_balancing= or the "
			"kernel.numa_balancing sysctl\n",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}
static int __init setup_numabalancing(char *str)
{
	int ret = 0;
	if (!str)
		goto out;

	if (!strcmp(str, "enable")) {
		numabalancing_override = 1;
		ret = 1;
	} else if (!strcmp(str, "disable")) {
		numabalancing_override = -1;
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("Unable to parse numa_balancing=\n");

	return ret;
}
__setup("numa_balancing=", setup_numabalancing);
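
/*
 * Example (boot command line): "numa_balancing=disable" sets the
 * override to -1, so check_numabalancing_enable() switches the feature
 * off even when CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is set. At
 * runtime the same switch is reachable via
 * "sysctl kernel.numa_balancing=0".
 */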
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.v = { .preferred_node = nid, },
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}
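
/*
 * Worked example (assuming 4KiB pages): the size test above,
 * (total_pages << PAGE_SHIFT) >= (16 << 20), admits a node once it has
 * at least 16MiB present: 4096 pages << 12 == 16777216 == 16 << 20.
 */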
/* Reset policy of current process to default */
void numa_default_policy(void)
{
	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave",
	[MPOL_LOCAL]      = "local",
};
#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str: string containing mempolicy to parse
 * @mpol: pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *new = NULL;
	unsigned short mode;
	unsigned short mode_flags;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int err = 1;

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	for (mode = 0; mode < MPOL_MAX; mode++) {
		if (!strcmp(str, policy_modes[mode])) {
			break;
		}
	}
	if (mode >= MPOL_MAX)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
		if (nodelist) {
			char *rest = nodelist;
			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
	}

	mode_flags = 0;
	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode != MPOL_PREFERRED)
		new->v.nodes = nodes;
	else if (nodelist)
		new->v.preferred_node = first_node(nodes);
	else
		new->flags |= MPOL_F_LOCAL;

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	new->w.user_nodemask = nodes;

	err = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
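
/*
 * Example inputs (illustrative tmpfs mount options):
 *
 *	"interleave:0-3"	MPOL_INTERLEAVE over nodes 0-3
 *	"bind=static:1,3"	MPOL_BIND | MPOL_F_STATIC_NODES on nodes 1,3
 *	"prefer:2"		MPOL_PREFERRED with preferred_node == 2
 *	"local"			MPOL_PREFERRED | MPOL_F_LOCAL
 *
 * e.g. "mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt".
 */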
#endif /* CONFIG_TMPFS */
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer: to contain formatted mempolicy string
 * @maxlen: length of @buffer
 * @pol: pointer to mempolicy to be formatted
 *
 * Convert @pol into a string. If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
		break;
	case MPOL_PREFERRED:
		if (flags & MPOL_F_LOCAL)
			mode = MPOL_LOCAL;
		else
			node_set(pol->v.preferred_node, nodes);
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	p += snprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += snprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += snprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += snprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes)) {
		p += snprintf(p, buffer + maxlen - p, ":");
		p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
	}
}
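
/*
 * Example outputs (illustrative), mirroring the inputs accepted by
 * mpol_parse_str(): an interleave policy over nodes 0-3 formats as
 * "interleave:0-3", a static bind over nodes 1 and 3 as
 * "bind=static:1,3", and a local preferred policy simply as "local".
 */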