/*
 * Memory Migration functionality - linux/mm/migration.c
 *
 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
 *
 * Page migration was first developed in the context of the memory hotplug
 * project. The main authors of the migration code are:
 *
 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
 * Hirokazu Takahashi <taka@valinux.co.jp>
 * Dave Hansen <haveblue@us.ibm.com>
 * Christoph Lameter <clameter@sgi.com>
 */
#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/writeback.h>
/* The maximum number of pages to take off the LRU for migration */
#define MIGRATE_CHUNK_SIZE 256

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
/*
 * Isolate one page from the LRU lists. If successful put it onto
 * the indicated list with elevated page count.
 *
 * Result:
 *  -EBUSY: page not on LRU list
 *  0: page removed from LRU list and added to the specified list.
 */
int isolate_lru_page(struct page *page, struct list_head *pagelist)
{
	int ret = -EBUSY;

	if (PageLRU(page)) {
		struct zone *zone = page_zone(page);

		spin_lock_irq(&zone->lru_lock);
		if (PageLRU(page)) {
			ret = 0;
			get_page(page);
			ClearPageLRU(page);
			if (PageActive(page))
				del_page_from_active_list(zone, page);
			else
				del_page_from_inactive_list(zone, page);
			list_add_tail(&page->lru, pagelist);
		}
		spin_unlock_irq(&zone->lru_lock);
	}
	return ret;
}
/*
 * migrate_prep() needs to be called after we have compiled the list of pages
 * to be migrated using isolate_lru_page() but before we begin a series of calls
 * to migrate_pages().
 */
int migrate_prep(void)
{
	/*
	 * Clear the LRU lists so pages can be isolated.
	 * Note that pages may be moved off the LRU after we have
	 * drained them. Those pages will fail to migrate like other
	 * pages that may be busy.
	 */
	lru_add_drain_all();

	return 0;
}
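/*
 * Illustrative sketch (not part of the original source): the expected
 * calling sequence around the helpers above. The page and target_node
 * variables are hypothetical.
 *
 *	LIST_HEAD(pagelist);
 *
 *	if (isolate_lru_page(page, &pagelist) == 0) {
 *		migrate_prep();
 *		err = migrate_pages_to(&pagelist, NULL, target_node);
 *		putback_lru_pages(&pagelist);
 *	}
 */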
static inline void move_to_lru(struct page *page)
{
	list_del(&page->lru);
	if (PageActive(page)) {
		/*
		 * lru_cache_add_active checks that
		 * the PG_active bit is off.
		 */
		ClearPageActive(page);
		lru_cache_add_active(page);
	} else {
		lru_cache_add(page);
	}
	put_page(page);
}
/*
 * Add isolated pages on the list back to the LRU.
 *
 * returns the number of pages put back.
 */
int putback_lru_pages(struct list_head *l)
{
	struct page *page;
	struct page *page2;
	int count = 0;

	list_for_each_entry_safe(page, page2, l, lru) {
		move_to_lru(page);
		count++;
	}
	return count;
}
static inline int is_swap_pte(pte_t pte)
{
	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
}
/*
 * Restore a potential migration pte to a working pte entry
 */
static void remove_migration_pte(struct vm_area_struct *vma,
		struct page *old, struct page *new)
{
	struct mm_struct *mm = vma->vm_mm;
	swp_entry_t entry;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	unsigned long addr = page_address_in_vma(new, vma);

	if (addr == -EFAULT)
		return;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		return;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		return;

	ptep = pte_offset_map(pmd, addr);

	if (!is_swap_pte(*ptep)) {
		pte_unmap(ptep);
		return;
	}

	ptl = pte_lockptr(mm, pmd);
	spin_lock(ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);

	if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
		goto out;

	get_page(new);
	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
	if (is_write_migration_entry(entry))
		pte = pte_mkwrite(pte);
	set_pte_at(mm, addr, ptep, pte);

	if (PageAnon(new))
		page_add_anon_rmap(new, vma, addr);
	else
		page_add_file_rmap(new);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, pte);
	lazy_mmu_prot_update(pte);

out:
	pte_unmap_unlock(ptep, ptl);
}
/*
 * Note that remove_file_migration_ptes will only work on regular mappings,
 * Nonlinear mappings do not use migration entries.
 */
static void remove_file_migration_ptes(struct page *old, struct page *new)
{
	struct vm_area_struct *vma;
	struct address_space *mapping = page_mapping(new);
	struct prio_tree_iter iter;
	pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

	if (!mapping)
		return;

	spin_lock(&mapping->i_mmap_lock);

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		remove_migration_pte(vma, old, new);

	spin_unlock(&mapping->i_mmap_lock);
}
/*
 * Must hold mmap_sem lock on at least one of the vmas containing
 * the page so that the anon_vma cannot vanish.
 */
static void remove_anon_migration_ptes(struct page *old, struct page *new)
{
	struct anon_vma *anon_vma;
	struct vm_area_struct *vma;
	unsigned long mapping;

	mapping = (unsigned long)new->mapping;

	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
		return;

	/*
	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
	 */
	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
	spin_lock(&anon_vma->lock);

	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
		remove_migration_pte(vma, old, new);

	spin_unlock(&anon_vma->lock);
}
/*
 * Get rid of all migration entries and replace them by
 * references to the indicated page.
 */
static void remove_migration_ptes(struct page *old, struct page *new)
{
	if (PageAnon(new))
		remove_anon_migration_ptes(old, new);
	else
		remove_file_migration_ptes(old, new);
}
/*
 * Something used the pte of a page under migration. We need to
 * get to the page and wait until migration is finished.
 * When we return from this function the fault will be retried.
 *
 * This function is called from do_swap_page().
 */
void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
				unsigned long address)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;
	swp_entry_t entry;
	struct page *page;

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!is_swap_pte(pte))
		goto out;

	entry = pte_to_swp_entry(pte);
	if (!is_migration_entry(entry))
		goto out;

	page = migration_entry_to_page(entry);

	get_page(page);
	pte_unmap_unlock(ptep, ptl);
	wait_on_page_locked(page);
	put_page(page);
	return;
out:
	pte_unmap_unlock(ptep, ptl);
}
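/*
 * Call-site sketch (not part of this file): do_swap_page() checks for a
 * migration entry before treating the pte as a real swap entry, roughly:
 *
 *	entry = pte_to_swp_entry(orig_pte);
 *	if (is_migration_entry(entry)) {
 *		migration_entry_wait(mm, pmd, address);
 *		goto out;
 *	}
 */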
/*
 * Replace the page in the mapping.
 *
 * The number of remaining references must be:
 * 1 for anonymous pages without a mapping
 * 2 for pages with a mapping
 * 3 for pages with a mapping and PagePrivate set.
 */
static int migrate_page_move_mapping(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct page **radix_pointer;

	if (!mapping) {
		/* Anonymous page without mapping */
		if (page_count(page) != 1)
			return -EAGAIN;
		return 0;
	}

	write_lock_irq(&mapping->tree_lock);

	radix_pointer = (struct page **)radix_tree_lookup_slot(
						&mapping->page_tree,
						page_index(page));

	if (page_count(page) != 2 + !!PagePrivate(page) ||
			*radix_pointer != page) {
		write_unlock_irq(&mapping->tree_lock);
		return -EAGAIN;
	}

	/*
	 * Now we know that no one else is looking at the page.
	 */
	get_page(newpage);
	if (PageSwapCache(page)) {
		SetPageSwapCache(newpage);
		set_page_private(newpage, page_private(page));
	}

	*radix_pointer = newpage;
	__put_page(page);
	write_unlock_irq(&mapping->tree_lock);

	return 0;
}
/*
 * Copy the page to its new location
 */
static void migrate_page_copy(struct page *newpage, struct page *page)
{
	copy_highpage(newpage, page);

	if (PageError(page))
		SetPageError(newpage);
	if (PageReferenced(page))
		SetPageReferenced(newpage);
	if (PageUptodate(page))
		SetPageUptodate(newpage);
	if (PageActive(page))
		SetPageActive(newpage);
	if (PageChecked(page))
		SetPageChecked(newpage);
	if (PageMappedToDisk(page))
		SetPageMappedToDisk(newpage);

	if (PageDirty(page)) {
		clear_page_dirty_for_io(page);
		set_page_dirty(newpage);
	}

	ClearPageSwapCache(page);
	ClearPageActive(page);
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page->mapping = NULL;

	/*
	 * If any waiters have accumulated on the new page then
	 * wake them up.
	 */
	if (PageWriteback(newpage))
		end_page_writeback(newpage);
}
/************************************************************
 *                    Migration functions
 ***********************************************************/

/* Always fail migration. Used for mappings that are not movable */
int fail_migrate_page(struct address_space *mapping,
			struct page *newpage, struct page *page)
{
	return -EIO;
}
EXPORT_SYMBOL(fail_migrate_page);
/*
 * Common logic to directly migrate a single page suitable for
 * pages that do not use PagePrivate.
 *
 * Pages are locked upon entry and exit.
 */
int migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	int rc;

	BUG_ON(PageWriteback(page));	/* Writeback must be complete */

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	migrate_page_copy(newpage, page);
	return 0;
}
EXPORT_SYMBOL(migrate_page);
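/*
 * Usage sketch (not part of the original source): a filesystem whose pages
 * carry no private data can advertise migration support by pointing its
 * address_space_operations at this helper. The ex_aops name is hypothetical:
 *
 *	static struct address_space_operations ex_aops = {
 *		...
 *		.migratepage	= migrate_page,
 *	};
 */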
/*
 * Migration function for pages with buffers. This function can only be used
 * if the underlying filesystem guarantees that no other references to "page"
 * exist.
 */
int buffer_migrate_page(struct address_space *mapping,
		struct page *newpage, struct page *page)
{
	struct buffer_head *bh, *head;
	int rc;

	if (!page_has_buffers(page))
		return migrate_page(mapping, newpage, page);

	head = page_buffers(page);

	rc = migrate_page_move_mapping(mapping, newpage, page);

	if (rc)
		return rc;

	bh = head;
	do {
		get_bh(bh);
		lock_buffer(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	ClearPagePrivate(page);
	set_page_private(newpage, page_private(page));
	set_page_private(page, 0);
	put_page(page);
	get_page(newpage);

	bh = head;
	do {
		set_bh_page(bh, newpage, bh_offset(bh));
		bh = bh->b_this_page;

	} while (bh != head);

	SetPagePrivate(newpage);

	migrate_page_copy(newpage, page);

	bh = head;
	do {
		unlock_buffer(bh);
		put_bh(bh);
		bh = bh->b_this_page;

	} while (bh != head);

	return 0;
}
EXPORT_SYMBOL(buffer_migrate_page);
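/*
 * Usage sketch (not part of the original source): block-backed filesystems
 * whose pages carry buffer heads would instead wire up:
 *
 *	.migratepage	= buffer_migrate_page,
 */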
/*
 * Writeback a page to clean the dirty state
 */
static int writeout(struct address_space *mapping, struct page *page)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = 1,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.nonblocking = 1,
		.for_reclaim = 1
	};
	int rc;

	if (!mapping->a_ops->writepage)
		/* No write method for the address space */
		return -EINVAL;

	if (!clear_page_dirty_for_io(page))
		/* Someone else already triggered a write */
		return -EAGAIN;

	/*
	 * A dirty page may imply that the underlying filesystem has
	 * the page on some queue. So the page must be clean for
	 * migration. Writeout may mean we lose the lock and the
	 * page state is no longer what we checked for earlier.
	 * At this point we know that the migration attempt cannot
	 * be successful.
	 */
	remove_migration_ptes(page, page);

	rc = mapping->a_ops->writepage(page, &wbc);

	if (rc < 0)
		/* I/O Error writing */
		return -EIO;

	if (rc != AOP_WRITEPAGE_ACTIVATE)
		/* unlocked. Relock */
		lock_page(page);

	return -EAGAIN;
}
/*
 * Default handling if a filesystem does not provide a migration function.
 */
static int fallback_migrate_page(struct address_space *mapping,
	struct page *newpage, struct page *page)
{
	if (PageDirty(page))
		return writeout(mapping, page);

	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_buffers(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;

	return migrate_page(mapping, newpage, page);
}
/*
 * migrate_pages
 *
 * Two lists are passed to this function. The first list
 * contains the pages isolated from the LRU to be migrated.
 * The second list contains new pages that the pages isolated
 * can be moved to.
 *
 * The function returns after 10 attempts or if no pages
 * are movable anymore because "to" has become empty
 * or no retryable pages exist anymore.
 *
 * Return: Number of pages not migrated when "to" ran empty.
 */
int migrate_pages(struct list_head *from, struct list_head *to,
		  struct list_head *moved, struct list_head *failed)
{
	int retry;
	int nr_failed = 0;
	int pass = 0;
	struct page *page;
	struct page *page2;
	int swapwrite = current->flags & PF_SWAPWRITE;
	int rc;

	if (!swapwrite)
		current->flags |= PF_SWAPWRITE;

redo:
	retry = 0;

	list_for_each_entry_safe(page, page2, from, lru) {
		struct page *newpage = NULL;
		struct address_space *mapping;

		cond_resched();

		rc = 0;
		if (page_count(page) == 1)
			/* page was freed from under us. So we are done. */
			goto next;

		if (to && list_empty(to))
			break;

		/*
		 * Skip locked pages during the first two passes to give the
		 * functions holding the lock time to release the page. Later we
		 * use lock_page() to have a higher chance of acquiring the
		 * lock.
		 */
		rc = -EAGAIN;
		if (pass > 2)
			lock_page(page);
		else
			if (TestSetPageLocked(page))
				goto next;

		/*
		 * Only wait on writeback if we have already done a pass where
		 * we may have triggered writeouts for lots of pages.
		 */
		if (pass > 0)
			wait_on_page_writeback(page);
		else
			if (PageWriteback(page))
				goto unlock_page;

		/*
		 * Establish migration ptes or remove ptes
		 */
		rc = -EPERM;
		if (try_to_unmap(page, 1) == SWAP_FAIL)
			/* A vma has VM_LOCKED set -> permanent failure */
			goto unlock_page;

		rc = -EAGAIN;
		if (page_mapped(page))
			goto unlock_page;

		newpage = lru_to_page(to);
		lock_page(newpage);

		/* Prepare mapping for the new page. */
		newpage->index = page->index;
		newpage->mapping = page->mapping;

		/*
		 * Pages are properly locked and writeback is complete.
		 * Try to migrate the page.
		 */
		mapping = page_mapping(page);
		if (!mapping)
			rc = migrate_page(mapping, newpage, page);

		else if (mapping->a_ops->migratepage)
			/*
			 * Most pages have a mapping and most filesystems
			 * should provide a migration function. Anonymous
			 * pages are part of swap space which also has its
			 * own migration function. This is the most common
			 * path for page migration.
			 */
			rc = mapping->a_ops->migratepage(mapping,
							newpage, page);
		else
			rc = fallback_migrate_page(mapping, newpage, page);

		if (!rc)
			remove_migration_ptes(page, newpage);

		unlock_page(newpage);

unlock_page:
		if (rc)
			remove_migration_ptes(page, page);

		unlock_page(page);

next:
		if (rc) {
			if (newpage)
				newpage->mapping = NULL;

			if (rc == -EAGAIN)
				retry++;
			else {
				/* Permanent failure */
				list_move(&page->lru, failed);
				nr_failed++;
			}
		} else {
			if (newpage) {
				/* Successful migration. Return page to LRU */
				move_to_lru(newpage);
			}
			list_move(&page->lru, moved);
		}
	}
	if (retry && pass++ < 10)
		goto redo;

	if (!swapwrite)
		current->flags &= ~PF_SWAPWRITE;

	return nr_failed + retry;
}
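/*
 * Usage sketch (not part of the original source): isolated pages go in
 * "from", preallocated target pages in "to"; migrated pages end up on
 * "moved" and permanent failures on "failed":
 *
 *	nr = migrate_pages(&pagelist, &newlist, &moved, &failed);
 *	putback_lru_pages(&moved);
 */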
/*
 * Migrate the list 'pagelist' of pages to a certain destination.
 *
 * Specify destination with either non-NULL vma or dest_node >= 0
 * Return the number of pages not migrated or error code
 */
int migrate_pages_to(struct list_head *pagelist,
			struct vm_area_struct *vma, int dest)
{
	LIST_HEAD(newlist);
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int err = 0;
	unsigned long offset = 0;
	int nr_pages;
	struct page *page;
	struct list_head *p;

redo:
	nr_pages = 0;
	list_for_each(p, pagelist) {
		if (vma) {
			/*
			 * The address passed to alloc_page_vma is used to
			 * generate the proper interleave behavior. We fake
			 * the address here by an increasing offset in order
			 * to get the proper distribution of pages.
			 *
			 * No decision has been made as to which page
			 * a certain old page is moved to so we cannot
			 * specify the correct address.
			 */
			page = alloc_page_vma(GFP_HIGHUSER, vma,
					offset + vma->vm_start);
			offset += PAGE_SIZE;
		}
		else
			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);

		if (!page) {
			err = -ENOMEM;
			goto out;
		}
		list_add_tail(&page->lru, &newlist);
		nr_pages++;
		if (nr_pages > MIGRATE_CHUNK_SIZE)
			break;
	}
	err = migrate_pages(pagelist, &newlist, &moved, &failed);

	putback_lru_pages(&moved);	/* Call release_pages() instead ?? */

	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
		goto redo;
out:
	/* Return leftover allocated pages */
	while (!list_empty(&newlist)) {
		page = list_entry(newlist.next, struct page, lru);
		list_del(&page->lru);
		__free_page(page);
	}
	list_splice(&failed, pagelist);
	if (err < 0)
		return err;

	/* Calculate number of leftover pages */
	nr_pages = 0;
	list_for_each(p, pagelist)
		nr_pages++;
	return nr_pages;
}
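/*
 * Usage sketch (not part of the original source, argument values are
 * hypothetical): NUMA policy code can either follow a vma's policy for
 * the new pages or force a fixed node:
 *
 *	err = migrate_pages_to(&pagelist, vma, -1);	- follow vma's policy
 *	err = migrate_pages_to(&pagelist, NULL, node);	- fixed target node
 */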