/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
        .writepage      = swap_writepage,
        .set_page_dirty = swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
        .migratepage    = migrate_page,
#endif
};

struct address_space swapper_spaces[MAX_SWAPFILES] = {
        [0 ... MAX_SWAPFILES - 1] = {
                .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
                .i_mmap_writable = ATOMIC_INIT(0),
                .a_ops          = &swap_aops,
        }
};
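
/*
 * Each swap type gets its own swapper_spaces[] slot; swap_address_space()
 * (see <linux/swap.h>) maps a swp_entry_t to its slot by swap type, so
 * the radix-tree lock is split per swap area rather than being global.
 */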

#define INC_CACHE_INFO(x)       do { swap_cache_info.x++; } while (0)

static struct {
        unsigned long add_total;
        unsigned long del_total;
        unsigned long find_success;
        unsigned long find_total;
} swap_cache_info;
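
/*
 * The do { ... } while (0) wrapper makes INC_CACHE_INFO() expand to a
 * single statement that is safe inside an unbraced if/else:
 * INC_CACHE_INFO(add_total) becomes
 * do { swap_cache_info.add_total++; } while (0).
 */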

unsigned long total_swapcache_pages(void)
{
        int i;
        unsigned long ret = 0;

        for (i = 0; i < MAX_SWAPFILES; i++)
                ret += swapper_spaces[i].nrpages;
        return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
        printk("%lu pages in swap cache\n", total_swapcache_pages());
        printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
                swap_cache_info.add_total, swap_cache_info.del_total,
                swap_cache_info.find_success, swap_cache_info.find_total);
        printk("Free swap  = %ldkB\n",
                get_nr_swap_pages() << (PAGE_SHIFT - 10));
        printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
        int error;
        struct address_space *address_space;

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(PageSwapCache(page), page);
        VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

        page_cache_get(page);
        SetPageSwapCache(page);
        set_page_private(page, entry.val);

        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        error = radix_tree_insert(&address_space->page_tree,
                                  entry.val, page);
        if (likely(!error)) {
                address_space->nrpages++;
                __inc_zone_page_state(page, NR_FILE_PAGES);
                INC_CACHE_INFO(add_total);
        }
        spin_unlock_irq(&address_space->tree_lock);

        if (unlikely(error)) {
                /*
                 * Only the context which has set SWAP_HAS_CACHE flag
                 * would call add_to_swap_cache().
                 * So add_to_swap_cache() doesn't return -EEXIST.
                 */
                VM_BUG_ON(error == -EEXIST);
                set_page_private(page, 0UL);
                ClearPageSwapCache(page);
                page_cache_release(page);
        }

        return error;
}

int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
        int error;

        error = radix_tree_maybe_preload(gfp_mask);
        if (!error) {
                error = __add_to_swap_cache(page, entry);
                radix_tree_preload_end();
        }
        return error;
}
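
/*
 * The preload/insert split above exists because __add_to_swap_cache()
 * inserts into the radix tree under a spinlock with interrupts disabled,
 * where node allocation must not sleep; radix_tree_maybe_preload() fills
 * the per-CPU node pool up front while sleeping is still allowed.
 */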

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
        swp_entry_t entry;
        struct address_space *address_space;

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageSwapCache(page), page);
        VM_BUG_ON_PAGE(PageWriteback(page), page);

        entry.val = page_private(page);
        address_space = swap_address_space(entry);
        radix_tree_delete(&address_space->page_tree, page_private(page));
        set_page_private(page, 0);
        ClearPageSwapCache(page);
        address_space->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 * @list: list to which a huge page is split before being added
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page, struct list_head *list)
{
        swp_entry_t entry;
        int err;

        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(!PageUptodate(page), page);

        entry = get_swap_page();
        if (!entry.val)
                return 0;

        if (unlikely(PageTransHuge(page)))
                if (unlikely(split_huge_page_to_list(page, list))) {
                        swapcache_free(entry);
                        return 0;
                }

        /*
         * Radix-tree node allocations from PF_MEMALLOC contexts could
         * completely exhaust the page allocator. __GFP_NOMEMALLOC
         * stops emergency reserves from being allocated.
         *
         * TODO: this could cause a theoretical memory reclaim
         * deadlock in the swap out path.
         */
        /*
         * Add it to the swap cache and mark it dirty.
         */
        err = add_to_swap_cache(page, entry,
                        __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);

        if (!err) {     /* Success */
                SetPageDirty(page);
                return 1;
        } else {        /* -ENOMEM radix-tree allocation failure */
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
                 */
                swapcache_free(entry);
                return 0;
        }
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
        swp_entry_t entry;
        struct address_space *address_space;

        entry.val = page_private(page);

        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        __delete_from_swap_cache(page);
        spin_unlock_irq(&address_space->tree_lock);

        swapcache_free(entry);
        page_cache_release(page);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 */
static inline void free_swap_cache(struct page *page)
{
        if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
                try_to_free_swap(page);
                unlock_page(page);
        }
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
        free_swap_cache(page);
        page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
        struct page **pagep = pages;
        int i;

        lru_add_drain();
        for (i = 0; i < nr; i++)
                free_swap_cache(pagep[i]);
        release_pages(pagep, nr, false);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
        struct page *page;

        page = find_get_page(swap_address_space(entry), entry.val);

        if (page) {
                INC_CACHE_INFO(find_success);
                if (TestClearPageReadahead(page))
                        atomic_inc(&swapin_readahead_hits);
        }

        INC_CACHE_INFO(find_total);
        return page;
}

struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr,
                        bool *new_page_allocated)
{
        struct page *found_page, *new_page = NULL;
        struct address_space *swapper_space = swap_address_space(entry);
        int err;

        *new_page_allocated = false;

        do {
                /*
                 * First check the swap cache.  Since this is normally
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
                found_page = find_get_page(swapper_space, entry.val);
                if (found_page)
                        break;

                /*
                 * Get a new page to read into from swap.
                 */
                if (!new_page) {
                        new_page = alloc_page_vma(gfp_mask, vma, addr);
                        if (!new_page)
                                break;          /* Out of memory */
                }

                /*
                 * Call radix_tree_preload() while we can wait.
                 */
                err = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
                if (err)
                        break;

                /*
                 * Swap entry may have been freed since our caller observed it.
                 */
                err = swapcache_prepare(entry);
                if (err == -EEXIST) {
                        radix_tree_preload_end();
                        /*
                         * We might race against get_swap_page() and stumble
                         * across a SWAP_HAS_CACHE swap_map entry whose page
                         * has not been brought into the swapcache yet, while
                         * the other end is scheduled away waiting on discard
                         * I/O completion at scan_swap_map().
                         *
                         * In order to avoid turning this transitory state
                         * into a permanent loop around this -EEXIST case
                         * if !CONFIG_PREEMPT and the I/O completion happens
                         * to be waiting on the CPU waitqueue where we are now
                         * busy looping, we just conditionally invoke the
                         * scheduler here, if there are some more important
                         * tasks to run.
                         */
                        cond_resched();
                        continue;
                }
                if (err) {              /* swp entry is obsolete ? */
                        radix_tree_preload_end();
                        break;
                }

                /* May fail (-ENOMEM) if radix-tree node allocation failed. */
                __set_page_locked(new_page);
                SetPageSwapBacked(new_page);
                err = __add_to_swap_cache(new_page, entry);
                if (likely(!err)) {
                        radix_tree_preload_end();
                        /*
                         * Initiate read into locked page and return.
                         */
                        lru_cache_add_anon(new_page);
                        *new_page_allocated = true;
                        return new_page;
                }
                radix_tree_preload_end();
                ClearPageSwapBacked(new_page);
                __clear_page_locked(new_page);
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
                 */
                swapcache_free(entry);
        } while (err != -ENOMEM);

        if (new_page)
                page_cache_release(new_page);
        return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
{
        bool page_was_allocated;
        struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
                        vma, addr, &page_was_allocated);

        if (page_was_allocated)
                swap_readpage(retpage);

        return retpage;
}

static unsigned long swapin_nr_pages(unsigned long offset)
{
        static unsigned long prev_offset;
        unsigned int pages, max_pages, last_ra;
        static atomic_t last_readahead_pages;

        max_pages = 1 << READ_ONCE(page_cluster);
        if (max_pages <= 1)
                return 1;

        /*
         * This heuristic has been found to work well on both sequential and
         * random loads, swapping to hard disk or to SSD: please don't ask
         * what the "+ 2" means, it just happens to work well, that's all.
         */
        pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
        if (pages == 2) {
                /*
                 * We can have no readahead hits to judge by: but must not get
                 * stuck here forever, so check for an adjacent offset instead
                 * (and don't even bother to check whether swap type is same).
                 */
                if (offset != prev_offset + 1 && offset != prev_offset - 1)
                        pages = 1;
                prev_offset = offset;
        } else {
                unsigned int roundup = 4;
                while (roundup < pages)
                        roundup <<= 1;
                pages = roundup;
        }

        if (pages > max_pages)
                pages = max_pages;

        /* Don't shrink readahead too fast */
        last_ra = atomic_read(&last_readahead_pages) / 2;
        if (pages < last_ra)
                pages = last_ra;
        atomic_set(&last_readahead_pages, pages);

        return pages;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        unsigned long entry_offset = swp_offset(entry);
        unsigned long offset = entry_offset;
        unsigned long start_offset, end_offset;
        unsigned long mask;
        struct blk_plug plug;

        mask = swapin_nr_pages(offset) - 1;
        if (!mask)
                goto skip;

        /* Read a page_cluster sized and aligned cluster around offset. */
        start_offset = offset & ~mask;
        end_offset = offset | mask;
        if (!start_offset)      /* First page is swap header. */
                start_offset++;
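
        /*
         * Example of the window arithmetic: for offset 21 and an 8-page
         * window, mask is 7, so start_offset = 21 & ~7 = 16 and
         * end_offset = 21 | 7 = 23, i.e. the aligned cluster 16..23
         * containing the faulting entry.
         */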
        blk_start_plug(&plug);
        for (offset = start_offset; offset <= end_offset; offset++) {
                /* Ok, do the async read-ahead now */
                page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
                                                gfp_mask, vma, addr);
                if (!page)
                        continue;
                if (offset != entry_offset)
                        SetPageReadahead(page);
                page_cache_release(page);
        }
        blk_finish_plug(&plug);

        lru_add_drain();        /* Push any new pages onto the LRU now */
skip:
        return read_swap_cache_async(entry, gfp_mask, vma, addr);
}