mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/nodemask.h>
  76 #include <linux/cpuset.h>
  77 #include <linux/gfp.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/module.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/rmap.h>
  90 #include <linux/security.h>
  91 #include <linux/syscalls.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
/*
 * Internal flags: private additions to the MPOL_MF_* flag space, built by
 * shifting MPOL_MF_INTERNAL. They travel in the 'flags' argument of
 * check_range() and of the page table walkers it calls.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */

/* Slab cache that struct mempolicy objects are allocated from (see mpol_new()). */
static struct kmem_cache *policy_cache;
/* NOTE(review): presumably the cache for shared-policy nodes; not used in this part of the file. */
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/* Policy substituted when no other policy applies (see do_get_mempolicy()). */
struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .policy = MPOL_DEFAULT,
};

/* Forward declaration; the definition is not in this part of the file. */
static void mpol_rebind_policy(struct mempolicy *pol,
                               const nodemask_t *newmask);
 115
 116 /* Do sanity checking on a policy */
 117 static int mpol_check_policy(int mode, nodemask_t *nodes)
 118 {
 119         int was_empty, is_empty;
 120
 121         if (!nodes)
 122                 return 0;
 123
 124         /*
 125          * "Contextualize" the in-coming nodemast for cpusets:
 126          * Remember whether in-coming nodemask was empty,  If not,
 127          * restrict the nodes to the allowed nodes in the cpuset.
 128          * This is guaranteed to be a subset of nodes with memory.
 129          */
 130         cpuset_update_task_memory_state();
 131         is_empty = was_empty = nodes_empty(*nodes);
 132         if (!was_empty) {
 133                 nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
 134                 is_empty = nodes_empty(*nodes); /* after "contextualization" */
 135         }
 136
 137         switch (mode) {
 138         case MPOL_DEFAULT:
 139                 /*
 140                  * require caller to specify an empty nodemask
 141                  * before "contextualization"
 142                  */
 143                 if (!was_empty)
 144                         return -EINVAL;
 145                 break;
 146         case MPOL_BIND:
 147         case MPOL_INTERLEAVE:
 148                 /*
 149                  * require at least 1 valid node after "contextualization"
 150                  */
 151                 if (is_empty)
 152                         return -EINVAL;
 153                 break;
 154         case MPOL_PREFERRED:
 155                 /*
 156                  * Did caller specify invalid nodes?
 157                  * Don't silently accept this as "local allocation".
 158                  */
 159                 if (!was_empty && is_empty)
 160                         return -EINVAL;
 161                 break;
 162         }
 163         return 0;
 164 }
 165
 166 /* Generate a custom zonelist for the BIND policy. */
 167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 168 {
 169         struct zonelist *zl;
 170         int num, max, nd;
 171         enum zone_type k;
 172
 173         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 174         max++;                  /* space for zlcache_ptr (see mmzone.h) */
 175         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 176         if (!zl)
 177                 return ERR_PTR(-ENOMEM);
 178         zl->zlcache_ptr = NULL;
 179         num = 0;
 180         /* First put in the highest zones from all nodes, then all the next
 181            lower zones etc. Avoid empty zones because the memory allocator
 182            doesn't like them. If you implement node hot removal you
 183            have to fix that. */
 184         k = MAX_NR_ZONES - 1;
 185         while (1) {
 186                 for_each_node_mask(nd, *nodes) {
 187                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 188                         if (z->present_pages > 0)
 189                                 zl->zones[num++] = z;
 190                 }
 191                 if (k == 0)
 192                         break;
 193                 k--;
 194         }
 195         if (num == 0) {
 196                 kfree(zl);
 197                 return ERR_PTR(-EINVAL);
 198         }
 199         zl->zones[num] = NULL;
 200         return zl;
 201 }
 202
 203 /* Create a new policy */
 204 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 205 {
 206         struct mempolicy *policy;
 207
 208         pr_debug("setting mode %d nodes[0] %lx\n",
 209                  mode, nodes ? nodes_addr(*nodes)[0] : -1);
 210
 211         if (mode == MPOL_DEFAULT)
 212                 return NULL;
 213         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 214         if (!policy)
 215                 return ERR_PTR(-ENOMEM);
 216         atomic_set(&policy->refcnt, 1);
 217         switch (mode) {
 218         case MPOL_INTERLEAVE:
 219                 policy->v.nodes = *nodes;
 220                 if (nodes_weight(policy->v.nodes) == 0) {
 221                         kmem_cache_free(policy_cache, policy);
 222                         return ERR_PTR(-EINVAL);
 223                 }
 224                 break;
 225         case MPOL_PREFERRED:
 226                 policy->v.preferred_node = first_node(*nodes);
 227                 if (policy->v.preferred_node >= MAX_NUMNODES)
 228                         policy->v.preferred_node = -1;
 229                 break;
 230         case MPOL_BIND:
 231                 policy->v.zonelist = bind_zonelist(nodes);
 232                 if (IS_ERR(policy->v.zonelist)) {
 233                         void *error_code = policy->v.zonelist;
 234                         kmem_cache_free(policy_cache, policy);
 235                         return error_code;
 236                 }
 237                 break;
 238         }
 239         policy->policy = mode;
 240         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 241         return policy;
 242 }
 243
 244 static void gather_stats(struct page *, void *, int pte_dirty);
 245 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 246                                 unsigned long flags);
 247
 248 /* Scan through pages checking if pages follow certain conditions. */
 249 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 250                 unsigned long addr, unsigned long end,
 251                 const nodemask_t *nodes, unsigned long flags,
 252                 void *private)
 253 {
 254         pte_t *orig_pte;
 255         pte_t *pte;
 256         spinlock_t *ptl;
 257
 258         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 259         do {
 260                 struct page *page;
 261                 int nid;
 262
 263                 if (!pte_present(*pte))
 264                         continue;
 265                 page = vm_normal_page(vma, addr, *pte);
 266                 if (!page)
 267                         continue;
 268                 /*
 269                  * The check for PageReserved here is important to avoid
 270                  * handling zero pages and other pages that may have been
 271                  * marked special by the system.
 272                  *
 273                  * If the PageReserved would not be checked here then f.e.
 274                  * the location of the zero page could have an influence
 275                  * on MPOL_MF_STRICT, zero pages would be counted for
 276                  * the per node stats, and there would be useless attempts
 277                  * to put zero pages on the migration list.
 278                  */
 279                 if (PageReserved(page))
 280                         continue;
 281                 nid = page_to_nid(page);
 282                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 283                         continue;
 284
 285                 if (flags & MPOL_MF_STATS)
 286                         gather_stats(page, private, pte_dirty(*pte));
 287                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 288                         migrate_page_add(page, private, flags);
 289                 else
 290                         break;
 291         } while (pte++, addr += PAGE_SIZE, addr != end);
 292         pte_unmap_unlock(orig_pte, ptl);
 293         return addr != end;
 294 }
 295
 296 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 297                 unsigned long addr, unsigned long end,
 298                 const nodemask_t *nodes, unsigned long flags,
 299                 void *private)
 300 {
 301         pmd_t *pmd;
 302         unsigned long next;
 303
 304         pmd = pmd_offset(pud, addr);
 305         do {
 306                 next = pmd_addr_end(addr, end);
 307                 if (pmd_none_or_clear_bad(pmd))
 308                         continue;
 309                 if (check_pte_range(vma, pmd, addr, next, nodes,
 310                                     flags, private))
 311                         return -EIO;
 312         } while (pmd++, addr = next, addr != end);
 313         return 0;
 314 }
 315
 316 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 317                 unsigned long addr, unsigned long end,
 318                 const nodemask_t *nodes, unsigned long flags,
 319                 void *private)
 320 {
 321         pud_t *pud;
 322         unsigned long next;
 323
 324         pud = pud_offset(pgd, addr);
 325         do {
 326                 next = pud_addr_end(addr, end);
 327                 if (pud_none_or_clear_bad(pud))
 328                         continue;
 329                 if (check_pmd_range(vma, pud, addr, next, nodes,
 330                                     flags, private))
 331                         return -EIO;
 332         } while (pud++, addr = next, addr != end);
 333         return 0;
 334 }
 335
 336 static inline int check_pgd_range(struct vm_area_struct *vma,
 337                 unsigned long addr, unsigned long end,
 338                 const nodemask_t *nodes, unsigned long flags,
 339                 void *private)
 340 {
 341         pgd_t *pgd;
 342         unsigned long next;
 343
 344         pgd = pgd_offset(vma->vm_mm, addr);
 345         do {
 346                 next = pgd_addr_end(addr, end);
 347                 if (pgd_none_or_clear_bad(pgd))
 348                         continue;
 349                 if (check_pud_range(vma, pgd, addr, next, nodes,
 350                                     flags, private))
 351                         return -EIO;
 352         } while (pgd++, addr = next, addr != end);
 353         return 0;
 354 }
 355
 356 /*
 357  * Check if all pages in a range are on a set of nodes.
 358  * If pagelist != NULL then isolate pages from the LRU and
 359  * put them on the pagelist.
 360  */
 361 static struct vm_area_struct *
 362 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 363                 const nodemask_t *nodes, unsigned long flags, void *private)
 364 {
 365         int err;
 366         struct vm_area_struct *first, *vma, *prev;
 367
 368         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 369
 370                 err = migrate_prep();
 371                 if (err)
 372                         return ERR_PTR(err);
 373         }
 374
 375         first = find_vma(mm, start);
 376         if (!first)
 377                 return ERR_PTR(-EFAULT);
 378         prev = NULL;
 379         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 380                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 381                         if (!vma->vm_next && vma->vm_end < end)
 382                                 return ERR_PTR(-EFAULT);
 383                         if (prev && prev->vm_end < vma->vm_start)
 384                                 return ERR_PTR(-EFAULT);
 385                 }
 386                 if (!is_vm_hugetlb_page(vma) &&
 387                     ((flags & MPOL_MF_STRICT) ||
 388                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 389                                 vma_migratable(vma)))) {
 390                         unsigned long endvma = vma->vm_end;
 391
 392                         if (endvma > end)
 393                                 endvma = end;
 394                         if (vma->vm_start > start)
 395                                 start = vma->vm_start;
 396                         err = check_pgd_range(vma, start, endvma, nodes,
 397                                                 flags, private);
 398                         if (err) {
 399                                 first = ERR_PTR(err);
 400                                 break;
 401                         }
 402                 }
 403                 prev = vma;
 404         }
 405         return first;
 406 }
 407
 408 /* Apply policy to a single VMA */
 409 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 410 {
 411         int err = 0;
 412         struct mempolicy *old = vma->vm_policy;
 413
 414         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 415                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 416                  vma->vm_ops, vma->vm_file,
 417                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 418
 419         if (vma->vm_ops && vma->vm_ops->set_policy)
 420                 err = vma->vm_ops->set_policy(vma, new);
 421         if (!err) {
 422                 mpol_get(new);
 423                 vma->vm_policy = new;
 424                 mpol_free(old);
 425         }
 426         return err;
 427 }
 428
 429 /* Step 2: apply policy to a range and do splits. */
 430 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 431                        unsigned long end, struct mempolicy *new)
 432 {
 433         struct vm_area_struct *next;
 434         int err;
 435
 436         err = 0;
 437         for (; vma && vma->vm_start < end; vma = next) {
 438                 next = vma->vm_next;
 439                 if (vma->vm_start < start)
 440                         err = split_vma(vma->vm_mm, vma, start, 1);
 441                 if (!err && vma->vm_end > end)
 442                         err = split_vma(vma->vm_mm, vma, end, 0);
 443                 if (!err)
 444                         err = policy_vma(vma, new);
 445                 if (err)
 446                         break;
 447         }
 448         return err;
 449 }
 450
 451 /*
 452  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 453  * mempolicy.  Allows more rapid checking of this (combined perhaps
 454  * with other PF_* flag bits) on memory allocation hot code paths.
 455  *
 456  * If called from outside this file, the task 'p' should -only- be
 457  * a newly forked child not yet visible on the task list, because
 458  * manipulating the task flags of a visible task is not safe.
 459  *
 460  * The above limitation is why this routine has the funny name
 461  * mpol_fix_fork_child_flag().
 462  *
 463  * It is also safe to call this with a task pointer of current,
 464  * which the static wrapper mpol_set_task_struct_flag() does,
 465  * for use within this file.
 466  */
 467
 468 void mpol_fix_fork_child_flag(struct task_struct *p)
 469 {
 470         if (p->mempolicy)
 471                 p->flags |= PF_MEMPOLICY;
 472         else
 473                 p->flags &= ~PF_MEMPOLICY;
 474 }
 475
 476 static void mpol_set_task_struct_flag(void)
 477 {
 478         mpol_fix_fork_child_flag(current);
 479 }
 480
 481 /* Set the process memory policy */
 482 static long do_set_mempolicy(int mode, nodemask_t *nodes)
 483 {
 484         struct mempolicy *new;
 485
 486         if (mpol_check_policy(mode, nodes))
 487                 return -EINVAL;
 488         new = mpol_new(mode, nodes);
 489         if (IS_ERR(new))
 490                 return PTR_ERR(new);
 491         mpol_free(current->mempolicy);
 492         current->mempolicy = new;
 493         mpol_set_task_struct_flag();
 494         if (new && new->policy == MPOL_INTERLEAVE)
 495                 current->il_next = first_node(new->v.nodes);
 496         return 0;
 497 }
 498
 499 /* Fill a zone bitmap for a policy */
 500 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 501 {
 502         int i;
 503
 504         nodes_clear(*nodes);
 505         switch (p->policy) {
 506         case MPOL_BIND:
 507                 for (i = 0; p->v.zonelist->zones[i]; i++)
 508                         node_set(zone_to_nid(p->v.zonelist->zones[i]),
 509                                 *nodes);
 510                 break;
 511         case MPOL_DEFAULT:
 512                 break;
 513         case MPOL_INTERLEAVE:
 514                 *nodes = p->v.nodes;
 515                 break;
 516         case MPOL_PREFERRED:
 517                 /* or use current node instead of memory_map? */
 518                 if (p->v.preferred_node < 0)
 519                         *nodes = node_states[N_HIGH_MEMORY];
 520                 else
 521                         node_set(p->v.preferred_node, *nodes);
 522                 break;
 523         default:
 524                 BUG();
 525         }
 526 }
 527
 528 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 529 {
 530         struct page *p;
 531         int err;
 532
 533         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 534         if (err >= 0) {
 535                 err = page_to_nid(p);
 536                 put_page(p);
 537         }
 538         return err;
 539 }
 540
 541 /* Retrieve NUMA policy */
 542 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 543                              unsigned long addr, unsigned long flags)
 544 {
 545         int err;
 546         struct mm_struct *mm = current->mm;
 547         struct vm_area_struct *vma = NULL;
 548         struct mempolicy *pol = current->mempolicy;
 549
 550         cpuset_update_task_memory_state();
 551         if (flags &
 552                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 553                 return -EINVAL;
 554
 555         if (flags & MPOL_F_MEMS_ALLOWED) {
 556                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 557                         return -EINVAL;
 558                 *policy = 0;    /* just so it's initialized */
 559                 *nmask  = cpuset_current_mems_allowed;
 560                 return 0;
 561         }
 562
 563         if (flags & MPOL_F_ADDR) {
 564                 down_read(&mm->mmap_sem);
 565                 vma = find_vma_intersection(mm, addr, addr+1);
 566                 if (!vma) {
 567                         up_read(&mm->mmap_sem);
 568                         return -EFAULT;
 569                 }
 570                 if (vma->vm_ops && vma->vm_ops->get_policy)
 571                         pol = vma->vm_ops->get_policy(vma, addr);
 572                 else
 573                         pol = vma->vm_policy;
 574         } else if (addr)
 575                 return -EINVAL;
 576
 577         if (!pol)
 578                 pol = &default_policy;
 579
 580         if (flags & MPOL_F_NODE) {
 581                 if (flags & MPOL_F_ADDR) {
 582                         err = lookup_node(mm, addr);
 583                         if (err < 0)
 584                                 goto out;
 585                         *policy = err;
 586                 } else if (pol == current->mempolicy &&
 587                                 pol->policy == MPOL_INTERLEAVE) {
 588                         *policy = current->il_next;
 589                 } else {
 590                         err = -EINVAL;
 591                         goto out;
 592                 }
 593         } else
 594                 *policy = pol->policy;
 595
 596         if (vma) {
 597                 up_read(&current->mm->mmap_sem);
 598                 vma = NULL;
 599         }
 600
 601         err = 0;
 602         if (nmask)
 603                 get_zonemask(pol, nmask);
 604
 605  out:
 606         if (vma)
 607                 up_read(&current->mm->mmap_sem);
 608         return err;
 609 }
 610
 611 #ifdef CONFIG_MIGRATION
 612 /*
 613  * page migration
 614  */
 615 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 616                                 unsigned long flags)
 617 {
 618         /*
 619          * Avoid migrating a page that is shared with others.
 620          */
 621         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 622                 isolate_lru_page(page, pagelist);
 623 }
 624
 625 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 626 {
 627         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 628 }
 629
 630 /*
 631  * Migrate pages from one node to a target node.
 632  * Returns error or the number of pages not migrated.
 633  */
 634 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 635                            int flags)
 636 {
 637         nodemask_t nmask;
 638         LIST_HEAD(pagelist);
 639         int err = 0;
 640
 641         nodes_clear(nmask);
 642         node_set(source, nmask);
 643
 644         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 645                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 646
 647         if (!list_empty(&pagelist))
 648                 err = migrate_pages(&pagelist, new_node_page, dest);
 649
 650         return err;
 651 }
 652
 653 /*
 654  * Move pages between the two nodesets so as to preserve the physical
 655  * layout as much as possible.
 656  *
 657  * Returns the number of page that could not be moved.
 658  */
 659 int do_migrate_pages(struct mm_struct *mm,
 660         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 661 {
 662         LIST_HEAD(pagelist);
 663         int busy = 0;
 664         int err = 0;
 665         nodemask_t tmp;
 666
 667         down_read(&mm->mmap_sem);
 668
 669         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 670         if (err)
 671                 goto out;
 672
 673 /*
 674  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 675  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 676  * bit in 'tmp', and return that <source, dest> pair for migration.
 677  * The pair of nodemasks 'to' and 'from' define the map.
 678  *
 679  * If no pair of bits is found that way, fallback to picking some
 680  * pair of 'source' and 'dest' bits that are not the same.  If the
 681  * 'source' and 'dest' bits are the same, this represents a node
 682  * that will be migrating to itself, so no pages need move.
 683  *
 684  * If no bits are left in 'tmp', or if all remaining bits left
 685  * in 'tmp' correspond to the same bit in 'to', return false
 686  * (nothing left to migrate).
 687  *
 688  * This lets us pick a pair of nodes to migrate between, such that
 689  * if possible the dest node is not already occupied by some other
 690  * source node, minimizing the risk of overloading the memory on a
 691  * node that would happen if we migrated incoming memory to a node
 692  * before migrating outgoing memory source that same node.
 693  *
 694  * A single scan of tmp is sufficient.  As we go, we remember the
 695  * most recent <s, d> pair that moved (s != d).  If we find a pair
 696  * that not only moved, but what's better, moved to an empty slot
 697  * (d is not set in tmp), then we break out then, with that pair.
 698  * Otherwise when we finish scannng from_tmp, we at least have the
 699  * most recent <s, d> pair that moved.  If we get all the way through
 700  * the scan of tmp without finding any node that moved, much less
 701  * moved to an empty node, then there is nothing left worth migrating.
 702  */
 703
 704         tmp = *from_nodes;
 705         while (!nodes_empty(tmp)) {
 706                 int s,d;
 707                 int source = -1;
 708                 int dest = 0;
 709
 710                 for_each_node_mask(s, tmp) {
 711                         d = node_remap(s, *from_nodes, *to_nodes);
 712                         if (s == d)
 713                                 continue;
 714
 715                         source = s;     /* Node moved. Memorize */
 716                         dest = d;
 717
 718                         /* dest not in remaining from nodes? */
 719                         if (!node_isset(dest, tmp))
 720                                 break;
 721                 }
 722                 if (source == -1)
 723                         break;
 724
 725                 node_clear(source, tmp);
 726                 err = migrate_to_node(mm, source, dest, flags);
 727                 if (err > 0)
 728                         busy += err;
 729                 if (err < 0)
 730                         break;
 731         }
 732 out:
 733         up_read(&mm->mmap_sem);
 734         if (err < 0)
 735                 return err;
 736         return busy;
 737
 738 }
 739
 740 /*
 741  * Allocate a new page for page migration based on vma policy.
 742  * Start assuming that page is mapped by vma pointed to by @private.
 743  * Search forward from there, if not.  N.B., this assumes that the
 744  * list of pages handed to migrate_pages()--which is how we get here--
 745  * is in virtual address order.
 746  */
 747 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 748 {
 749         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 750         unsigned long uninitialized_var(address);
 751
 752         while (vma) {
 753                 address = page_address_in_vma(page, vma);
 754                 if (address != -EFAULT)
 755                         break;
 756                 vma = vma->vm_next;
 757         }
 758
 759         /*
 760          * if !vma, alloc_page_vma() will use task or system default policy
 761          */
 762         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 763 }
 764 #else
 765
/* !CONFIG_MIGRATION stub: page migration is compiled out, so nothing is queued. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
 770
/* !CONFIG_MIGRATION stub: always fails with -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
        return -ENOSYS;
}
 776
/* !CONFIG_MIGRATION stub: never provides a replacement page. */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
        return NULL;
}
 781 #endif
 782
 783 static long do_mbind(unsigned long start, unsigned long len,
 784                      unsigned long mode, nodemask_t *nmask,
 785                      unsigned long flags)
 786 {
 787         struct vm_area_struct *vma;
 788         struct mm_struct *mm = current->mm;
 789         struct mempolicy *new;
 790         unsigned long end;
 791         int err;
 792         LIST_HEAD(pagelist);
 793
 794         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 795                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 796             || mode > MPOL_MAX)
 797                 return -EINVAL;
 798         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 799                 return -EPERM;
 800
 801         if (start & ~PAGE_MASK)
 802                 return -EINVAL;
 803
 804         if (mode == MPOL_DEFAULT)
 805                 flags &= ~MPOL_MF_STRICT;
 806
 807         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 808         end = start + len;
 809
 810         if (end < start)
 811                 return -EINVAL;
 812         if (end == start)
 813                 return 0;
 814
 815         if (mpol_check_policy(mode, nmask))
 816                 return -EINVAL;
 817
 818         new = mpol_new(mode, nmask);
 819         if (IS_ERR(new))
 820                 return PTR_ERR(new);
 821
 822         /*
 823          * If we are using the default policy then operation
 824          * on discontinuous address spaces is okay after all
 825          */
 826         if (!new)
 827                 flags |= MPOL_MF_DISCONTIG_OK;
 828
 829         pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 830                  mode, nmask ? nodes_addr(*nmask)[0] : -1);
 831
 832         down_write(&mm->mmap_sem);
 833         vma = check_range(mm, start, end, nmask,
 834                           flags | MPOL_MF_INVERT, &pagelist);
 835
 836         err = PTR_ERR(vma);
 837         if (!IS_ERR(vma)) {
 838                 int nr_failed = 0;
 839
 840                 err = mbind_range(vma, start, end, new);
 841
 842                 if (!list_empty(&pagelist))
 843                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 844                                                 (unsigned long)vma);
 845
 846                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 847                         err = -EIO;
 848         }
 849
 850         up_write(&mm->mmap_sem);
 851         mpol_free(new);
 852         return err;
 853 }
 854
 855 /*
 856  * User space interface with variable sized bitmaps for nodelists.
 857  */
 858
 859 /* Copy a node mask from user space. */
 860 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 861                      unsigned long maxnode)
 862 {
 863         unsigned long k;
 864         unsigned long nlongs;
 865         unsigned long endmask;
 866
 867         --maxnode;
 868         nodes_clear(*nodes);
 869         if (maxnode == 0 || !nmask)
 870                 return 0;
 871         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 872                 return -EINVAL;
 873
 874         nlongs = BITS_TO_LONGS(maxnode);
 875         if ((maxnode % BITS_PER_LONG) == 0)
 876                 endmask = ~0UL;
 877         else
 878                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 879
 880         /* When the user specified more nodes than supported just check
 881            if the non supported part is all zero. */
 882         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 883                 if (nlongs > PAGE_SIZE/sizeof(long))
 884                         return -EINVAL;
 885                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 886                         unsigned long t;
 887                         if (get_user(t, nmask + k))
 888                                 return -EFAULT;
 889                         if (k == nlongs - 1) {
 890                                 if (t & endmask)
 891                                         return -EINVAL;
 892                         } else if (t)
 893                                 return -EINVAL;
 894                 }
 895                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 896                 endmask = ~0UL;
 897         }
 898
 899         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 900                 return -EFAULT;
 901         nodes_addr(*nodes)[nlongs-1] &= endmask;
 902         return 0;
 903 }
 904
 905 /* Copy a kernel node mask to user space */
 906 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 907                               nodemask_t *nodes)
 908 {
 909         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 910         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 911
 912         if (copy > nbytes) {
 913                 if (copy > PAGE_SIZE)
 914                         return -EINVAL;
 915                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 916                         return -EFAULT;
 917                 copy = nbytes;
 918         }
 919         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 920 }
 921
 922 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 923                         unsigned long mode,
 924                         unsigned long __user *nmask, unsigned long maxnode,
 925                         unsigned flags)
 926 {
 927         nodemask_t nodes;
 928         int err;
 929
 930         err = get_nodes(&nodes, nmask, maxnode);
 931         if (err)
 932                 return err;
 933         return do_mbind(start, len, mode, &nodes, flags);
 934 }
 935
 936 /* Set the process memory policy */
 937 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 938                 unsigned long maxnode)
 939 {
 940         int err;
 941         nodemask_t nodes;
 942
 943         if (mode < 0 || mode > MPOL_MAX)
 944                 return -EINVAL;
 945         err = get_nodes(&nodes, nmask, maxnode);
 946         if (err)
 947                 return err;
 948         return do_set_mempolicy(mode, &nodes);
 949 }
 950
 951 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 952                 const unsigned long __user *old_nodes,
 953                 const unsigned long __user *new_nodes)
 954 {
 955         struct mm_struct *mm;
 956         struct task_struct *task;
 957         nodemask_t old;
 958         nodemask_t new;
 959         nodemask_t task_nodes;
 960         int err;
 961
 962         err = get_nodes(&old, old_nodes, maxnode);
 963         if (err)
 964                 return err;
 965
 966         err = get_nodes(&new, new_nodes, maxnode);
 967         if (err)
 968                 return err;
 969
 970         /* Find the mm_struct */
 971         read_lock(&tasklist_lock);
 972         task = pid ? find_task_by_vpid(pid) : current;
 973         if (!task) {
 974                 read_unlock(&tasklist_lock);
 975                 return -ESRCH;
 976         }
 977         mm = get_task_mm(task);
 978         read_unlock(&tasklist_lock);
 979
 980         if (!mm)
 981                 return -EINVAL;
 982
 983         /*
 984          * Check if this process has the right to modify the specified
 985          * process. The right exists if the process has administrative
 986          * capabilities, superuser privileges or the same
 987          * userid as the target process.
 988          */
 989         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 990             (current->uid != task->suid) && (current->uid != task->uid) &&
 991             !capable(CAP_SYS_NICE)) {
 992                 err = -EPERM;
 993                 goto out;
 994         }
 995
 996         task_nodes = cpuset_mems_allowed(task);
 997         /* Is the user allowed to access the target nodes? */
 998         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 999                 err = -EPERM;
1000                 goto out;
1001         }
1002
1003         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1004                 err = -EINVAL;
1005                 goto out;
1006         }
1007
1008         err = security_task_movememory(task);
1009         if (err)
1010                 goto out;
1011
1012         err = do_migrate_pages(mm, &old, &new,
1013                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1014 out:
1015         mmput(mm);
1016         return err;
1017 }
1018
1019
1020 /* Retrieve NUMA policy */
1021 asmlinkage long sys_get_mempolicy(int __user *policy,
1022                                 unsigned long __user *nmask,
1023                                 unsigned long maxnode,
1024                                 unsigned long addr, unsigned long flags)
1025 {
1026         int err;
1027         int uninitialized_var(pval);
1028         nodemask_t nodes;
1029
1030         if (nmask != NULL && maxnode < MAX_NUMNODES)
1031                 return -EINVAL;
1032
1033         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1034
1035         if (err)
1036                 return err;
1037
1038         if (policy && put_user(pval, policy))
1039                 return -EFAULT;
1040
1041         if (nmask)
1042                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1043
1044         return err;
1045 }
1046
1047 #ifdef CONFIG_COMPAT
1048
1049 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1050                                      compat_ulong_t __user *nmask,
1051                                      compat_ulong_t maxnode,
1052                                      compat_ulong_t addr, compat_ulong_t flags)
1053 {
1054         long err;
1055         unsigned long __user *nm = NULL;
1056         unsigned long nr_bits, alloc_size;
1057         DECLARE_BITMAP(bm, MAX_NUMNODES);
1058
1059         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1060         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1061
1062         if (nmask)
1063                 nm = compat_alloc_user_space(alloc_size);
1064
1065         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1066
1067         if (!err && nmask) {
1068                 err = copy_from_user(bm, nm, alloc_size);
1069                 /* ensure entire bitmap is zeroed */
1070                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1071                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1072         }
1073
1074         return err;
1075 }
1076
1077 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1078                                      compat_ulong_t maxnode)
1079 {
1080         long err = 0;
1081         unsigned long __user *nm = NULL;
1082         unsigned long nr_bits, alloc_size;
1083         DECLARE_BITMAP(bm, MAX_NUMNODES);
1084
1085         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1086         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1087
1088         if (nmask) {
1089                 err = compat_get_bitmap(bm, nmask, nr_bits);
1090                 nm = compat_alloc_user_space(alloc_size);
1091                 err |= copy_to_user(nm, bm, alloc_size);
1092         }
1093
1094         if (err)
1095                 return -EFAULT;
1096
1097         return sys_set_mempolicy(mode, nm, nr_bits+1);
1098 }
1099
1100 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1101                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1102                              compat_ulong_t maxnode, compat_ulong_t flags)
1103 {
1104         long err = 0;
1105         unsigned long __user *nm = NULL;
1106         unsigned long nr_bits, alloc_size;
1107         nodemask_t bm;
1108
1109         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1110         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1111
1112         if (nmask) {
1113                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1114                 nm = compat_alloc_user_space(alloc_size);
1115                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1116         }
1117
1118         if (err)
1119                 return -EFAULT;
1120
1121         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1122 }
1123
1124 #endif
1125
1126 /*
1127  * get_vma_policy(@task, @vma, @addr)
1128  * @task - task for fallback if vma policy == default
1129  * @vma   - virtual memory area whose policy is sought
1130  * @addr  - address in @vma for shared policy lookup
1131  *
1132  * Returns effective policy for a VMA at specified address.
1133  * Falls back to @task or system default policy, as necessary.
1134  * Returned policy has extra reference count if shared, vma,
1135  * or some other task's policy [show_numa_maps() can pass
1136  * @task != current].  It is the caller's responsibility to
1137  * free the reference in these cases.
1138  */
1139 static struct mempolicy * get_vma_policy(struct task_struct *task,
1140                 struct vm_area_struct *vma, unsigned long addr)
1141 {
1142         struct mempolicy *pol = task->mempolicy;
1143         int shared_pol = 0;
1144
1145         if (vma) {
1146                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1147                         pol = vma->vm_ops->get_policy(vma, addr);
1148                         shared_pol = 1; /* if pol non-NULL, add ref below */
1149                 } else if (vma->vm_policy &&
1150                                 vma->vm_policy->policy != MPOL_DEFAULT)
1151                         pol = vma->vm_policy;
1152         }
1153         if (!pol)
1154                 pol = &default_policy;
1155         else if (!shared_pol && pol != current->mempolicy)
1156                 mpol_get(pol);  /* vma or other task's policy */
1157         return pol;
1158 }
1159
1160 /* Return a zonelist representing a mempolicy */
1161 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1162 {
1163         int nd;
1164
1165         switch (policy->policy) {
1166         case MPOL_PREFERRED:
1167                 nd = policy->v.preferred_node;
1168                 if (nd < 0)
1169                         nd = numa_node_id();
1170                 break;
1171         case MPOL_BIND:
1172                 /* Lower zones don't get a policy applied */
1173                 /* Careful: current->mems_allowed might have moved */
1174                 if (gfp_zone(gfp) >= policy_zone)
1175                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1176                                 return policy->v.zonelist;
1177                 /*FALL THROUGH*/
1178         case MPOL_INTERLEAVE: /* should not happen */
1179         case MPOL_DEFAULT:
1180                 nd = numa_node_id();
1181                 break;
1182         default:
1183                 nd = 0;
1184                 BUG();
1185         }
1186         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1187 }
1188
1189 /* Do dynamic interleaving for a process */
1190 static unsigned interleave_nodes(struct mempolicy *policy)
1191 {
1192         unsigned nid, next;
1193         struct task_struct *me = current;
1194
1195         nid = me->il_next;
1196         next = next_node(nid, policy->v.nodes);
1197         if (next >= MAX_NUMNODES)
1198                 next = first_node(policy->v.nodes);
1199         me->il_next = next;
1200         return nid;
1201 }
1202
1203 /*
1204  * Depending on the memory policy provide a node from which to allocate the
1205  * next slab entry.
1206  */
1207 unsigned slab_node(struct mempolicy *policy)
1208 {
1209         int pol = policy ? policy->policy : MPOL_DEFAULT;
1210
1211         switch (pol) {
1212         case MPOL_INTERLEAVE:
1213                 return interleave_nodes(policy);
1214
1215         case MPOL_BIND:
1216                 /*
1217                  * Follow bind policy behavior and start allocation at the
1218                  * first node.
1219                  */
1220                 return zone_to_nid(policy->v.zonelist->zones[0]);
1221
1222         case MPOL_PREFERRED:
1223                 if (policy->v.preferred_node >= 0)
1224                         return policy->v.preferred_node;
1225                 /* Fall through */
1226
1227         default:
1228                 return numa_node_id();
1229         }
1230 }
1231
1232 /* Do static interleaving for a VMA with known offset. */
1233 static unsigned offset_il_node(struct mempolicy *pol,
1234                 struct vm_area_struct *vma, unsigned long off)
1235 {
1236         unsigned nnodes = nodes_weight(pol->v.nodes);
1237         unsigned target = (unsigned)off % nnodes;
1238         int c;
1239         int nid = -1;
1240
1241         c = 0;
1242         do {
1243                 nid = next_node(nid, pol->v.nodes);
1244                 c++;
1245         } while (c <= target);
1246         return nid;
1247 }
1248
1249 /* Determine a node number for interleave */
1250 static inline unsigned interleave_nid(struct mempolicy *pol,
1251                  struct vm_area_struct *vma, unsigned long addr, int shift)
1252 {
1253         if (vma) {
1254                 unsigned long off;
1255
1256                 /*
1257                  * for small pages, there is no difference between
1258                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1259                  * for huge pages, since vm_pgoff is in units of small
1260                  * pages, we need to shift off the always 0 bits to get
1261                  * a useful offset.
1262                  */
1263                 BUG_ON(shift < PAGE_SHIFT);
1264                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1265                 off += (addr - vma->vm_start) >> shift;
1266                 return offset_il_node(pol, vma, off);
1267         } else
1268                 return interleave_nodes(pol);
1269 }
1270
1271 #ifdef CONFIG_HUGETLBFS
1272 /*
1273  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1274  * @vma = virtual memory area whose policy is sought
1275  * @addr = address in @vma for shared policy lookup and interleave policy
1276  * @gfp_flags = for requested zone
1277  * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
1278  *
1279  * Returns a zonelist suitable for a huge page allocation.
1280  * If the effective policy is 'BIND, returns pointer to policy's zonelist.
1281  * If it is also a policy for which get_vma_policy() returns an extra
1282  * reference, we must hold that reference until after allocation.
1283  * In that case, return policy via @mpol so hugetlb allocation can drop
1284  * the reference.  For non-'BIND referenced policies, we can/do drop the
1285  * reference here, so the caller doesn't need to know about the special case
1286  * for default and current task policy.
1287  */
1288 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1289                                 gfp_t gfp_flags, struct mempolicy **mpol)
1290 {
1291         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1292         struct zonelist *zl;
1293
1294         *mpol = NULL;           /* probably no unref needed */
1295         if (pol->policy == MPOL_INTERLEAVE) {
1296                 unsigned nid;
1297
1298                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1299 <<<<<<< HEAD:mm/mempolicy.c
1300                 __mpol_free(pol);               /* finished with pol */
1301 =======
1302                 if (unlikely(pol != &default_policy &&
1303                                 pol != current->mempolicy))
1304                         __mpol_free(pol);       /* finished with pol */
1305 >>>>>>> 264e3e889d86e552b4191d69bb60f4f3b383135a:mm/mempolicy.c
1306                 return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1307         }
1308
1309         zl = zonelist_policy(GFP_HIGHUSER, pol);
1310         if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1311                 if (pol->policy != MPOL_BIND)
1312                         __mpol_free(pol);       /* finished with pol */
1313                 else
1314                         *mpol = pol;    /* unref needed after allocation */
1315         }
1316         return zl;
1317 }
1318 #endif
1319
1320 /* Allocate a page in interleaved policy.
1321    Own path because it needs to do special accounting. */
1322 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1323                                         unsigned nid)
1324 {
1325         struct zonelist *zl;
1326         struct page *page;
1327
1328         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1329         page = __alloc_pages(gfp, order, zl);
1330         if (page && page_zone(page) == zl->zones[0])
1331                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1332         return page;
1333 }
1334
1335 /**
1336  *      alloc_page_vma  - Allocate a page for a VMA.
1337  *
1338  *      @gfp:
1339  *      %GFP_USER    user allocation.
1340  *      %GFP_KERNEL  kernel allocations,
1341  *      %GFP_HIGHMEM highmem/user allocations,
1342  *      %GFP_FS      allocation should not call back into a file system.
1343  *      %GFP_ATOMIC  don't sleep.
1344  *
1345  *      @vma:  Pointer to VMA or NULL if not available.
1346  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1347  *
1348  *      This function allocates a page from the kernel page pool and applies
1349  *      a NUMA policy associated with the VMA or the current process.
1350  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1351  *      mm_struct of the VMA to prevent it from going away. Should be used for
1352  *      all allocations for pages that will be mapped into
1353  *      user space. Returns NULL when no page can be allocated.
1354  *
1355  *      Should be called with the mm_sem of the vma hold.
1356  */
1357 struct page *
1358 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1359 {
1360         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1361         struct zonelist *zl;
1362
1363         cpuset_update_task_memory_state();
1364
1365         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1366                 unsigned nid;
1367
1368                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1369 <<<<<<< HEAD:mm/mempolicy.c
1370 =======
1371                 if (unlikely(pol != &default_policy &&
1372                                 pol != current->mempolicy))
1373                         __mpol_free(pol);       /* finished with pol */
1374 >>>>>>> 264e3e889d86e552b4191d69bb60f4f3b383135a:mm/mempolicy.c
1375                 return alloc_page_interleave(gfp, 0, nid);
1376         }
1377         zl = zonelist_policy(gfp, pol);
1378         if (pol != &default_policy && pol != current->mempolicy) {
1379                 /*
1380                  * slow path: ref counted policy -- shared or vma
1381                  */
1382                 struct page *page =  __alloc_pages(gfp, 0, zl);
1383                 __mpol_free(pol);
1384                 return page;
1385         }
1386         /*
1387          * fast path:  default or task policy
1388          */
1389         return __alloc_pages(gfp, 0, zl);
1390 }
1391
1392 /**
1393  *      alloc_pages_current - Allocate pages.
1394  *
1395  *      @gfp:
1396  *              %GFP_USER   user allocation,
1397  *              %GFP_KERNEL kernel allocation,
1398  *              %GFP_HIGHMEM highmem allocation,
1399  *              %GFP_FS     don't call back into a file system.
1400  *              %GFP_ATOMIC don't sleep.
1401  *      @order: Power of two of allocation size in pages. 0 is a single page.
1402  *
1403  *      Allocate a page from the kernel page pool.  When not in
1404  *      interrupt context and apply the current process NUMA policy.
1405  *      Returns NULL when no page can be allocated.
1406  *
1407  *      Don't call cpuset_update_task_memory_state() unless
1408  *      1) it's ok to take cpuset_sem (can WAIT), and
1409  *      2) allocating for current task (not interrupt).
1410  */
1411 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1412 {
1413         struct mempolicy *pol = current->mempolicy;
1414
1415         if ((gfp & __GFP_WAIT) && !in_interrupt())
1416                 cpuset_update_task_memory_state();
1417         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1418                 pol = &default_policy;
1419         if (pol->policy == MPOL_INTERLEAVE)
1420                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1421         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1422 }
1423 EXPORT_SYMBOL(alloc_pages_current);
1424
1425 /*
1426  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1427  * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1428  * with the mems_allowed returned by cpuset_mems_allowed().  This
1429  * keeps mempolicies cpuset relative after its cpuset moves.  See
1430  * further kernel/cpuset.c update_nodemask().
1431  */
1432
1433 /* Slow path of a mempolicy copy */
1434 struct mempolicy *__mpol_copy(struct mempolicy *old)
1435 {
1436         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1437
1438         if (!new)
1439                 return ERR_PTR(-ENOMEM);
1440         if (current_cpuset_is_being_rebound()) {
1441                 nodemask_t mems = cpuset_mems_allowed(current);
1442                 mpol_rebind_policy(old, &mems);
1443         }
1444         *new = *old;
1445         atomic_set(&new->refcnt, 1);
1446         if (new->policy == MPOL_BIND) {
1447                 int sz = ksize(old->v.zonelist);
1448                 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1449                 if (!new->v.zonelist) {
1450                         kmem_cache_free(policy_cache, new);
1451                         return ERR_PTR(-ENOMEM);
1452                 }
1453         }
1454         return new;
1455 }
1456
1457 /* Slow path of a mempolicy comparison */
1458 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1459 {
1460         if (!a || !b)
1461                 return 0;
1462         if (a->policy != b->policy)
1463                 return 0;
1464         switch (a->policy) {
1465         case MPOL_DEFAULT:
1466                 return 1;
1467         case MPOL_INTERLEAVE:
1468                 return nodes_equal(a->v.nodes, b->v.nodes);
1469         case MPOL_PREFERRED:
1470                 return a->v.preferred_node == b->v.preferred_node;
1471         case MPOL_BIND: {
1472                 int i;
1473                 for (i = 0; a->v.zonelist->zones[i]; i++)
1474                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1475                                 return 0;
1476                 return b->v.zonelist->zones[i] == NULL;
1477         }
1478         default:
1479                 BUG();
1480                 return 0;
1481         }
1482 }
1483
1484 /* Slow path of a mpol destructor. */
1485 void __mpol_free(struct mempolicy *p)
1486 {
1487         if (!atomic_dec_and_test(&p->refcnt))
1488                 return;
1489         if (p->policy == MPOL_BIND)
1490                 kfree(p->v.zonelist);
1491         p->policy = MPOL_DEFAULT;
1492         kmem_cache_free(policy_cache, p);
1493 }
1494
1495 /*
1496  * Shared memory backing store policy support.
1497  *
1498  * Remember policies even when nobody has shared memory mapped.
1499  * The policies are kept in Red-Black tree linked from the inode.
1500  * They are protected by the sp->lock spinlock, which should be held
1501  * for any accesses to the tree.
1502  */
1503
1504 /* lookup first element intersecting start-end */
1505 /* Caller holds sp->lock */
1506 static struct sp_node *
1507 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1508 {
1509         struct rb_node *n = sp->root.rb_node;
1510
1511         while (n) {
1512                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1513
1514                 if (start >= p->end)
1515                         n = n->rb_right;
1516                 else if (end <= p->start)
1517                         n = n->rb_left;
1518                 else
1519                         break;
1520         }
1521         if (!n)
1522                 return NULL;
1523         for (;;) {
1524                 struct sp_node *w = NULL;
1525                 struct rb_node *prev = rb_prev(n);
1526                 if (!prev)
1527                         break;
1528                 w = rb_entry(prev, struct sp_node, nd);
1529                 if (w->end <= start)
1530                         break;
1531                 n = prev;
1532         }
1533         return rb_entry(n, struct sp_node, nd);
1534 }
1535
1536 /* Insert a new shared policy into the list. */
1537 /* Caller holds sp->lock */
1538 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1539 {
1540         struct rb_node **p = &sp->root.rb_node;
1541         struct rb_node *parent = NULL;
1542         struct sp_node *nd;
1543
1544         while (*p) {
1545                 parent = *p;
1546                 nd = rb_entry(parent, struct sp_node, nd);
1547                 if (new->start < nd->start)
1548                         p = &(*p)->rb_left;
1549                 else if (new->end > nd->end)
1550                         p = &(*p)->rb_right;
1551                 else
1552                         BUG();
1553         }
1554         rb_link_node(&new->nd, parent, p);
1555         rb_insert_color(&new->nd, &sp->root);
1556         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1557                  new->policy ? new->policy->policy : 0);
1558 }
1559
1560 /* Find shared policy intersecting idx */
1561 struct mempolicy *
1562 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1563 {
1564         struct mempolicy *pol = NULL;
1565         struct sp_node *sn;
1566
1567         if (!sp->root.rb_node)
1568                 return NULL;
1569         spin_lock(&sp->lock);
1570         sn = sp_lookup(sp, idx, idx+1);
1571         if (sn) {
1572                 mpol_get(sn->policy);
1573                 pol = sn->policy;
1574         }
1575         spin_unlock(&sp->lock);
1576         return pol;
1577 }
1578
1579 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1580 {
1581         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1582         rb_erase(&n->nd, &sp->root);
1583         mpol_free(n->policy);
1584         kmem_cache_free(sn_cache, n);
1585 }
1586
1587 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1588                                 struct mempolicy *pol)
1589 {
1590         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1591
1592         if (!n)
1593                 return NULL;
1594         n->start = start;
1595         n->end = end;
1596         mpol_get(pol);
1597         n->policy = pol;
1598         return n;
1599 }
1600
1601 /* Replace a policy range. */
1602 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1603                                  unsigned long end, struct sp_node *new)
1604 {
1605         struct sp_node *n, *new2 = NULL;
1606
1607 restart:
1608         spin_lock(&sp->lock);
1609         n = sp_lookup(sp, start, end);
1610         /* Take care of old policies in the same range. */
1611         while (n && n->start < end) {
1612                 struct rb_node *next = rb_next(&n->nd);
1613                 if (n->start >= start) {
1614                         if (n->end <= end)
1615                                 sp_delete(sp, n);
1616                         else
1617                                 n->start = end;
1618                 } else {
1619                         /* Old policy spanning whole new range. */
1620                         if (n->end > end) {
1621                                 if (!new2) {
1622                                         spin_unlock(&sp->lock);
1623                                         new2 = sp_alloc(end, n->end, n->policy);
1624                                         if (!new2)
1625                                                 return -ENOMEM;
1626                                         goto restart;
1627                                 }
1628                                 n->end = start;
1629                                 sp_insert(sp, new2);
1630                                 new2 = NULL;
1631                                 break;
1632                         } else
1633                                 n->end = start;
1634                 }
1635                 if (!next)
1636                         break;
1637                 n = rb_entry(next, struct sp_node, nd);
1638         }
1639         if (new)
1640                 sp_insert(sp, new);
1641         spin_unlock(&sp->lock);
1642         if (new2) {
1643                 mpol_free(new2->policy);
1644                 kmem_cache_free(sn_cache, new2);
1645         }
1646         return 0;
1647 }
1648
1649 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1650                                 nodemask_t *policy_nodes)
1651 {
1652         info->root = RB_ROOT;
1653         spin_lock_init(&info->lock);
1654
1655         if (policy != MPOL_DEFAULT) {
1656                 struct mempolicy *newpol;
1657
1658                 /* Falls back to MPOL_DEFAULT on any error */
1659                 newpol = mpol_new(policy, policy_nodes);
1660                 if (!IS_ERR(newpol)) {
1661                         /* Create pseudo-vma that contains just the policy */
1662                         struct vm_area_struct pvma;
1663
1664                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1665                         /* Policy covers entire file */
1666                         pvma.vm_end = TASK_SIZE;
1667                         mpol_set_shared_policy(info, &pvma, newpol);
1668                         mpol_free(newpol);
1669                 }
1670         }
1671 }
1672
1673 int mpol_set_shared_policy(struct shared_policy *info,
1674                         struct vm_area_struct *vma, struct mempolicy *npol)
1675 {
1676         int err;
1677         struct sp_node *new = NULL;
1678         unsigned long sz = vma_pages(vma);
1679
1680         pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
1681                  vma->vm_pgoff,
1682                  sz, npol? npol->policy : -1,
1683                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1684
1685         if (npol) {
1686                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1687                 if (!new)
1688                         return -ENOMEM;
1689         }
1690         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1691         if (err && new)
1692                 kmem_cache_free(sn_cache, new);
1693         return err;
1694 }
1695
1696 /* Free a backing policy store on inode delete. */
1697 void mpol_free_shared_policy(struct shared_policy *p)
1698 {
1699         struct sp_node *n;
1700         struct rb_node *next;
1701
1702         if (!p->root.rb_node)
1703                 return;
1704         spin_lock(&p->lock);
1705         next = rb_first(&p->root);
1706         while (next) {
1707                 n = rb_entry(next, struct sp_node, nd);
1708                 next = rb_next(&n->nd);
1709                 rb_erase(&n->nd, &p->root);
1710                 mpol_free(n->policy);
1711                 kmem_cache_free(sn_cache, n);
1712         }
1713         spin_unlock(&p->lock);
1714 }
1715
1716 /* assumes fs == KERNEL_DS */
1717 void __init numa_policy_init(void)
1718 {
1719         nodemask_t interleave_nodes;
1720         unsigned long largest = 0;
1721         int nid, prefer = 0;
1722
1723         policy_cache = kmem_cache_create("numa_policy",
1724                                          sizeof(struct mempolicy),
1725                                          0, SLAB_PANIC, NULL);
1726
1727         sn_cache = kmem_cache_create("shared_policy_node",
1728                                      sizeof(struct sp_node),
1729                                      0, SLAB_PANIC, NULL);
1730
1731         /*
1732          * Set interleaving policy for system init. Interleaving is only
1733          * enabled across suitably sized nodes (default is >= 16MB), or
1734          * fall back to the largest node if they're all smaller.
1735          */
1736         nodes_clear(interleave_nodes);
1737         for_each_node_state(nid, N_HIGH_MEMORY) {
1738                 unsigned long total_pages = node_present_pages(nid);
1739
1740                 /* Preserve the largest node */
1741                 if (largest < total_pages) {
1742                         largest = total_pages;
1743                         prefer = nid;
1744                 }
1745
1746                 /* Interleave this node? */
1747                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1748                         node_set(nid, interleave_nodes);
1749         }
1750
1751         /* All too small, use the largest */
1752         if (unlikely(nodes_empty(interleave_nodes)))
1753                 node_set(prefer, interleave_nodes);
1754
1755         if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
1756                 printk("numa_policy_init: interleaving failed\n");
1757 }
1758
1759 /* Reset policy of current process to default */
1760 void numa_default_policy(void)
1761 {
1762         do_set_mempolicy(MPOL_DEFAULT, NULL);
1763 }
1764
1765 /* Migrate a policy to a different set of nodes */
1766 static void mpol_rebind_policy(struct mempolicy *pol,
1767                                const nodemask_t *newmask)
1768 {
1769         nodemask_t *mpolmask;
1770         nodemask_t tmp;
1771
1772         if (!pol)
1773                 return;
1774         mpolmask = &pol->cpuset_mems_allowed;
1775         if (nodes_equal(*mpolmask, *newmask))
1776                 return;
1777
1778         switch (pol->policy) {
1779         case MPOL_DEFAULT:
1780                 break;
1781         case MPOL_INTERLEAVE:
1782                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1783                 pol->v.nodes = tmp;
1784                 *mpolmask = *newmask;
1785                 current->il_next = node_remap(current->il_next,
1786                                                 *mpolmask, *newmask);
1787                 break;
1788         case MPOL_PREFERRED:
1789                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1790                                                 *mpolmask, *newmask);
1791                 *mpolmask = *newmask;
1792                 break;
1793         case MPOL_BIND: {
1794                 nodemask_t nodes;
1795                 struct zone **z;
1796                 struct zonelist *zonelist;
1797
1798                 nodes_clear(nodes);
1799                 for (z = pol->v.zonelist->zones; *z; z++)
1800                         node_set(zone_to_nid(*z), nodes);
1801                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1802                 nodes = tmp;
1803
1804                 zonelist = bind_zonelist(&nodes);
1805
1806                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1807                  * If that old zonelist has no remaining mems_allowed nodes,
1808                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1809                  */
1810
1811                 if (!IS_ERR(zonelist)) {
1812                         /* Good - got mem - substitute new zonelist */
1813                         kfree(pol->v.zonelist);
1814                         pol->v.zonelist = zonelist;
1815                 }
1816                 *mpolmask = *newmask;
1817                 break;
1818         }
1819         default:
1820                 BUG();
1821                 break;
1822         }
1823 }
1824
1825 /*
1826  * Wrapper for mpol_rebind_policy() that just requires task
1827  * pointer, and updates task mempolicy.
1828  */
1829
1830 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1831 {
1832         mpol_rebind_policy(tsk->mempolicy, new);
1833 }
1834
1835 /*
1836  * Rebind each vma in mm to new nodemask.
1837  *
1838  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1839  */
1840
1841 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1842 {
1843         struct vm_area_struct *vma;
1844
1845         down_write(&mm->mmap_sem);
1846         for (vma = mm->mmap; vma; vma = vma->vm_next)
1847                 mpol_rebind_policy(vma->vm_policy, new);
1848         up_write(&mm->mmap_sem);
1849 }
1850
1851 /*
1852  * Display pages allocated per node and memory policy via /proc.
1853  */
1854
1855 static const char * const policy_types[] =
1856         { "default", "prefer", "bind", "interleave" };
1857
1858 /*
1859  * Convert a mempolicy into a string.
1860  * Returns the number of characters in buffer (if positive)
1861  * or an error (negative)
1862  */
1863 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1864 {
1865         char *p = buffer;
1866         int l;
1867         nodemask_t nodes;
1868         int mode = pol ? pol->policy : MPOL_DEFAULT;
1869
1870         switch (mode) {
1871         case MPOL_DEFAULT:
1872                 nodes_clear(nodes);
1873                 break;
1874
1875         case MPOL_PREFERRED:
1876                 nodes_clear(nodes);
1877                 node_set(pol->v.preferred_node, nodes);
1878                 break;
1879
1880         case MPOL_BIND:
1881                 get_zonemask(pol, &nodes);
1882                 break;
1883
1884         case MPOL_INTERLEAVE:
1885                 nodes = pol->v.nodes;
1886                 break;
1887
1888         default:
1889                 BUG();
1890                 return -EFAULT;
1891         }
1892
1893         l = strlen(policy_types[mode]);
1894         if (buffer + maxlen < p + l + 1)
1895                 return -ENOSPC;
1896
1897         strcpy(p, policy_types[mode]);
1898         p += l;
1899
1900         if (!nodes_empty(nodes)) {
1901                 if (buffer + maxlen < p + 2)
1902                         return -ENOSPC;
1903                 *p++ = '=';
1904                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1905         }
1906         return p - buffer;
1907 }
1908
1909 struct numa_maps {
1910         unsigned long pages;
1911         unsigned long anon;
1912         unsigned long active;
1913         unsigned long writeback;
1914         unsigned long mapcount_max;
1915         unsigned long dirty;
1916         unsigned long swapcache;
1917         unsigned long node[MAX_NUMNODES];
1918 };
1919
1920 static void gather_stats(struct page *page, void *private, int pte_dirty)
1921 {
1922         struct numa_maps *md = private;
1923         int count = page_mapcount(page);
1924
1925         md->pages++;
1926         if (pte_dirty || PageDirty(page))
1927                 md->dirty++;
1928
1929         if (PageSwapCache(page))
1930                 md->swapcache++;
1931
1932         if (PageActive(page))
1933                 md->active++;
1934
1935         if (PageWriteback(page))
1936                 md->writeback++;
1937
1938         if (PageAnon(page))
1939                 md->anon++;
1940
1941         if (count > md->mapcount_max)
1942                 md->mapcount_max = count;
1943
1944         md->node[page_to_nid(page)]++;
1945 }
1946
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for the huge pages mapped in [start, end) of @vma.
 *
 * @vma:   hugetlb vma to scan (vma->vm_mm is used for the lookup)
 * @start: first address to look at
 * @end:   end of the range (exclusive)
 * @md:    statistics to update through gather_stats()
 *
 * Walks the range in HPAGE_SIZE steps and skips addresses that have no
 * huge pte, an empty pte or no page behind the pte.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		pte_t pte;

		if (!ptep)
			continue;

		/*
		 * Read the entry once and take every decision from that
		 * snapshot, so that the page and its dirty state always
		 * come from the same pte value.
		 */
		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		gather_stats(page, md, pte_dirty(pte));
	}
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to scan. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
1980
1981 int show_numa_map(struct seq_file *m, void *v)
1982 {
1983         struct proc_maps_private *priv = m->private;
1984         struct vm_area_struct *vma = v;
1985         struct numa_maps *md;
1986         struct file *file = vma->vm_file;
1987         struct mm_struct *mm = vma->vm_mm;
1988         struct mempolicy *pol;
1989         int n;
1990         char buffer[50];
1991
1992         if (!mm)
1993                 return 0;
1994
1995         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1996         if (!md)
1997                 return 0;
1998
1999         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2000         mpol_to_str(buffer, sizeof(buffer), pol);
2001         /*
2002          * unref shared or other task's mempolicy
2003          */
2004         if (pol != &default_policy && pol != current->mempolicy)
2005                 __mpol_free(pol);
2006
2007         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2008
2009         if (file) {
2010                 seq_printf(m, " file=");
2011                 seq_path(m, &file->f_path, "\n\t= ");
2012         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2013                 seq_printf(m, " heap");
2014         } else if (vma->vm_start <= mm->start_stack &&
2015                         vma->vm_end >= mm->start_stack) {
2016                 seq_printf(m, " stack");
2017         }
2018
2019         if (is_vm_hugetlb_page(vma)) {
2020                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2021                 seq_printf(m, " huge");
2022         } else {
2023                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2024                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2025         }
2026
2027         if (!md->pages)
2028                 goto out;
2029
2030         if (md->anon)
2031                 seq_printf(m," anon=%lu",md->anon);
2032
2033         if (md->dirty)
2034                 seq_printf(m," dirty=%lu",md->dirty);
2035
2036         if (md->pages != md->anon && md->pages != md->dirty)
2037                 seq_printf(m, " mapped=%lu", md->pages);
2038
2039         if (md->mapcount_max > 1)
2040                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2041
2042         if (md->swapcache)
2043                 seq_printf(m," swapcache=%lu", md->swapcache);
2044
2045         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2046                 seq_printf(m," active=%lu", md->active);
2047
2048         if (md->writeback)
2049                 seq_printf(m," writeback=%lu", md->writeback);
2050
2051         for_each_node_state(n, N_HIGH_MEMORY)
2052                 if (md->node[n])
2053                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2054 out:
2055         seq_putc(m, '\n');
2056         kfree(md);
2057
2058         if (m->count < m->size)
2059                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2060         return 0;
2061 }