/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/bigmem.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page_addr;
	struct page * page;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page_addr = pte_page(pte);
	if (MAP_NR(page_addr) >= max_mapnr)
		goto out_failed;

	page = mem_map + MAP_NR(page_addr);
	spin_lock(&vma->vm_mm->page_table_lock);
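	/*
	 * The pte was sampled before we took the page table lock;
	 * re-check it now that we hold the lock, and bail out if the
	 * mapping changed under us.
	 */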
	if (pte_val(pte) != pte_val(*page_table))
		goto out_failed_unlock;

	/* Don't look at this pte if it's been accessed recently. */
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page->flags);
		goto out_failed_unlock;
	}
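	/*
	 * Skip pages we must not touch: reserved or locked pages, and
	 * pages that don't satisfy the caller's allocation constraints
	 * (non-DMA pages for a DMA request, BIGMEM pages when the
	 * caller cannot take them).
	 */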
	if (PageReserved(page)
	    || PageLocked(page)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
	    || (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)))
		goto out_failed_unlock;

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry = page->offset;
		swap_duplicate(entry);
		set_pte(page_table, __pte(entry));
	drop_pte:
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		__free_page(page);
		goto out_failed_unlock;
	}
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		goto drop_pte;
	}

	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		goto out_failed_unlock;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in 'page', and just drop the
	 * pte. All the hard work would be done by
	 * shrink_mmap().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && vma->vm_ops->swapout) {
		int error;
		pte_clear(page_table);
		spin_unlock(&vma->vm_mm->page_table_lock);
		flush_tlb_page(vma, address);
		vma->vm_mm->rss--;
		error = vma->vm_ops->swapout(vma, page);
		if (!error)
			goto out_free_success;
		__free_page(page);
		return error;
	}
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = acquire_swap_entry(page);
	if (!entry)
		goto out_failed_unlock;	/* No swap space left */

	if (!(page = prepare_bigmem_swapout(page)))
		goto out_swap_free_unlock;

	vma->vm_mm->rss--;
	set_pte(page_table, __pte(entry));
	spin_unlock(&vma->vm_mm->page_table_lock);

	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */

	/* This will also lock the page */
	add_to_swap_cache(page, entry);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, page, 0);
out_free_success:
	__free_page(page);
	return 1;
out_failed_unlock:
	spin_unlock(&vma->vm_mm->page_table_lock);
out_failed:
	return 0;
out_swap_free_unlock:
	swap_free(entry);
	spin_unlock(&vma->vm_mm->page_table_lock);
	return 0;
}
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 * swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;
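	/*
	 * Walk the ptes one page at a time, recording our progress in
	 * mm->swap_address so that the next scan of this mm resumes
	 * where this one left off.
	 */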
	do {
		int result;
		vma->vm_mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & VM_LOCKED)
		return 0;

	pgdir = pgd_offset(vma->vm_mm, address);

	end = vma->vm_end;
	while (address < end) {
		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	mm->swap_cnt = 0;
	mm->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p;
	int counter;
	int __ret = 0;

	lock_kernel();
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = nr_threads / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_threads)
		counter = nr_threads;
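	/*
	 * "counter" limits how many victim selections we attempt this
	 * call: the smaller the priority value (i.e. the more urgent
	 * the request), the more of the task list we are willing to
	 * work through, up to one attempt per thread.
	 */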
	for (; counter >= 0; counter--) {
		int assign = 0;
		int max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				mm->swap_cnt = mm->rss;
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		} else {
			int ret;
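			/*
			 * Pin the chosen mm so it cannot go away while we
			 * scan it outside the tasklist lock; the mmdrop()
			 * below releases the reference again.
			 */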
			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			if (!ret)
				continue;

			if (ret < 0)
				kill_proc(pid, SIGBUS, 1);
			__ret = 1;
			goto out;
		}
	}
out:
	unlock_kernel();
	return __ret;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	priority = 6;
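	/*
	 * Repeat the whole sequence below with increasing aggressiveness
	 * (priority 6 down to 0) until we have freed "count" pages or
	 * run out of things to try.
	 */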
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* Don't go too easy on the d/i caches, since
		   shrink_mmap() almost never fails when there's
		   really plenty of memory free. */
		count -= shrink_dcache_memory(priority, gfp_mask);
		count -= shrink_icache_memory(priority, gfp_mask);
		if (count <= 0)
			goto done;

		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}

		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}
	} while (--priority >= 0);
done:
	return priority >= 0;
}
static struct task_struct *kswapd_process;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
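	/*
	 * Put the daemon into the init session and process group and
	 * block all signals; kswapd runs for the lifetime of the system.
	 */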
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			/* kswapd is critical to provide GFP_ATOMIC
			   allocations (not GFP_BIGMEM ones). */
			if (nr_free_pages - nr_free_bigpages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
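			/* Kick the disk task queue so the swap writes we
			   just queued actually go out to the device. */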
			run_task_queue(&tq_disk);
		} while (!tsk->need_resched);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.6\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
	return 0;
}

module_init(kswapd_init)