/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
 */
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/bigmem.h>

#include <asm/pgtable.h>
/*
 * The swap-out functions return 1 if they successfully
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
static int try_to_swap_out(struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask)
{
	pte_t pte;
	unsigned long entry;
	unsigned long page_addr;
	struct page * page;

	pte = *page_table;
	if (!pte_present(pte))
		goto out_failed;
	page_addr = pte_page(pte);
	if (MAP_NR(page_addr) >= max_mapnr)
		goto out_failed;

	page = mem_map + MAP_NR(page_addr);
	spin_lock(&vma->vm_mm->page_table_lock);
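	/*
	 * The pte was sampled before we took the page table lock;
	 * re-check it now that we hold the lock, and bail out if the
	 * mapping changed under us.
	 */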
	if (pte_val(pte) != pte_val(*page_table))
		goto out_failed_unlock;

	/* Don't look at this pte if it's been accessed recently. */
	if (pte_young(pte)) {
		/*
		 * Transfer the "accessed" bit from the page
		 * tables to the global page map.
		 */
		set_pte(page_table, pte_mkold(pte));
		set_bit(PG_referenced, &page->flags);
		goto out_failed_unlock;
	}
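	/*
	 * Skip pages we must not touch: reserved or locked pages, and
	 * pages that don't satisfy the caller's allocation constraints
	 * (non-DMA pages for a DMA request, BIGMEM pages when the
	 * caller cannot take them).
	 */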
	if (PageReserved(page)
	    || PageLocked(page)
	    || ((gfp_mask & __GFP_DMA) && !PageDMA(page))
	    || (!(gfp_mask & __GFP_BIGMEM) && PageBIGMEM(page)))
		goto out_failed_unlock;

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 *
	 * Return 0, as we didn't actually free any real
	 * memory, and we should just continue our scan.
	 */
	if (PageSwapCache(page)) {
		entry = page->offset;
		swap_duplicate(entry);
		set_pte(page_table, __pte(entry));
	drop_pte:
		vma->vm_mm->rss--;
		flush_tlb_page(vma, address);
		__free_page(page);
		goto out_failed_unlock;
	}
	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "shrink_mmap()".
	 */
	if (!pte_dirty(pte)) {
		pte_clear(page_table);
		goto drop_pte;
	}

	/*
	 * Don't go down into the swap-out stuff if
	 * we cannot do I/O! Avoid recursing on FS
	 * locks etc.
	 */
	if (!(gfp_mask & __GFP_IO))
		goto out_failed_unlock;
	/*
	 * Ok, it's really dirty. That means that
	 * we should either create a new swap cache
	 * entry for it, or we should write it back
	 * to its own backing store.
	 *
	 * Note that in neither case do we actually
	 * know that we make a page available, but
	 * as we potentially sleep we can no longer
	 * continue scanning, so we might as well
	 * assume we free'd something.
	 *
	 * NOTE NOTE NOTE! This should just set a
	 * dirty bit in 'page', and just drop the
	 * pte. All the hard work would be done by
	 * shrink_mmap().
	 *
	 * That would get rid of a lot of problems.
	 */
	flush_cache_page(vma, address);
	if (vma->vm_ops && vma->vm_ops->swapout) {
		int error;
		pte_clear(page_table);
		spin_unlock(&vma->vm_mm->page_table_lock);
		flush_tlb_page(vma, address);
		vma->vm_mm->rss--;
		error = vma->vm_ops->swapout(vma, page);
		if (!error)
			goto out_free_success;
		__free_page(page);
		return error;
	}
	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	entry = acquire_swap_entry(page);
	if (!entry)
		goto out_failed_unlock;	/* No swap space left */

	if (!(page = prepare_bigmem_swapout(page)))
		goto out_swap_free_unlock;

	vma->vm_mm->rss--;
	set_pte(page_table, __pte(entry));
	spin_unlock(&vma->vm_mm->page_table_lock);

	flush_tlb_page(vma, address);
	swap_duplicate(entry);	/* One for the process, one for the swap cache */

	/* This will also lock the page */
	add_to_swap_cache(page, entry);

	/* OK, do a physical asynchronous write to swap. */
	rw_swap_page(WRITE, page, 0);
out_free_success:
	__free_page(page);
	return 1;
out_failed_unlock:
	spin_unlock(&vma->vm_mm->page_table_lock);
out_failed:
	return 0;
out_swap_free_unlock:
	swap_free(entry);
	spin_unlock(&vma->vm_mm->page_table_lock);
	return 0;
}
/*
 * A new implementation of swap_out(). We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process. The number of blocks actually swapped is determined by the
 * number of page faults that this process had recently, so we won't
 * swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste on the
 * swap block search, not a hint of how many blocks to swap with
 * each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
static inline int swap_out_pmd(struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;
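	/*
	 * Walk the ptes one page at a time, recording our progress in
	 * mm->swap_address so that the next scan of this mm resumes
	 * where this one left off.
	 */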
	do {
		int result;
		vma->vm_mm->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(vma, address, pte, gfp_mask);
		if (result)
			return result;
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return 0;
}
static inline int swap_out_pgd(struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	do {
		int result = swap_out_pmd(vma, pmd, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return 0;
}
static int swap_out_vma(struct vm_area_struct * vma, unsigned long address, int gfp_mask)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are locked down */
	if (vma->vm_flags & VM_LOCKED)
		return 0;

	pgdir = pgd_offset(vma->vm_mm, address);

	end = vma->vm_end;
	while (address < end) {
		int result = swap_out_pgd(vma, pgdir, address, end, gfp_mask);
		if (result)
			return result;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	}
	return 0;
}
static int swap_out_mm(struct mm_struct * mm, int gfp_mask)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
	address = mm->swap_address;

	/*
	 * Find the proper vm-area
	 */
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			int result = swap_out_vma(vma, address, gfp_mask);
			if (result)
				return result;
			vma = vma->vm_next;
			if (!vma)
				break;
			address = vma->vm_start;
		}
	}

	/* We didn't find anything for the process */
	mm->swap_cnt = 0;
	mm->swap_address = 0;
	return 0;
}
/*
 * Select the task with maximal swap_cnt and try to swap out a page.
 * N.B. This function returns only 0 or 1. Return values != 1 from
 * the lower level routines result in continued processing.
 */
static int swap_out(unsigned int priority, int gfp_mask)
{
	struct task_struct * p;
	int counter;
	int __ret = 0;

	lock_kernel();
	/*
	 * We make one or two passes through the task list, indexed by
	 * assign = {0, 1}:
	 *   Pass 1: select the swappable task with maximal RSS that has
	 *           not yet been swapped out.
	 *   Pass 2: re-assign rss swap_cnt values, then select as above.
	 *
	 * With this approach, there's no need to remember the last task
	 * swapped out. If the swap-out fails, we clear swap_cnt so the
	 * task won't be selected again until all others have been tried.
	 *
	 * Think of swap_cnt as a "shadow rss" - it tells us which process
	 * we want to page out (always try largest first).
	 */
	counter = nr_threads / (priority+1);
	if (counter < 1)
		counter = 1;
	if (counter > nr_threads)
		counter = nr_threads;
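	/*
	 * "counter" limits how many victim selections we attempt this
	 * call: the smaller the priority value (i.e. the more urgent
	 * the request), the more of the task list we are willing to
	 * work through, up to one attempt per thread.
	 */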
	for (; counter >= 0; counter--) {
		int assign = 0;
		int max_cnt = 0;
		struct mm_struct *best = NULL;
		int pid = 0;
	select:
		read_lock(&tasklist_lock);
		p = init_task.next_task;
		for (; p != &init_task; p = p->next_task) {
			struct mm_struct *mm = p->mm;
			if (!p->swappable || !mm)
				continue;
			if (mm->rss <= 0)
				continue;
			/* Refresh swap_cnt? */
			if (assign)
				mm->swap_cnt = mm->rss;
			if (mm->swap_cnt > max_cnt) {
				max_cnt = mm->swap_cnt;
				best = mm;
				pid = p->pid;
			}
		}
		read_unlock(&tasklist_lock);
		if (!best) {
			if (!assign) {
				assign = 1;
				goto select;
			}
			goto out;
		} else {
			int ret;
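			/*
			 * Pin the chosen mm so it cannot go away while we
			 * scan it outside the tasklist lock; the mmdrop()
			 * below releases the reference again.
			 */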
			atomic_inc(&best->mm_count);
			ret = swap_out_mm(best, gfp_mask);
			mmdrop(best);

			if (!ret)
				continue;

			if (ret < 0)
				kill_proc(pid, SIGBUS, 1);
			__ret = 1;
			goto out;
		}
	}
out:
	unlock_kernel();
	return __ret;
}
/*
 * We need to make the locks finer granularity, but right
 * now we need this so that we can do page allocations
 * without holding the kernel lock etc.
 *
 * We want to try to free "count" pages, and we need to
 * cluster them so that we get good swap-out behaviour. See
 * the "free_memory()" macro for details.
 */
static int do_try_to_free_pages(unsigned int gfp_mask)
{
	int priority;
	int count = SWAP_CLUSTER_MAX;

	/* Always trim SLAB caches when memory gets low. */
	kmem_cache_reap(gfp_mask);

	priority = 6;
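	/*
	 * Repeat the whole sequence below with increasing aggressiveness
	 * (priority 6 down to 0) until we have freed "count" pages or
	 * run out of things to try.
	 */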
	do {
		while (shrink_mmap(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}

		/* Don't go too easy on the d/i caches, since
		   shrink_mmap() almost never fails when there's
		   really plenty of memory free. */
		count -= shrink_dcache_memory(priority, gfp_mask);
		count -= shrink_icache_memory(priority, gfp_mask);
		if (count <= 0)
			goto done;

		/* Try to get rid of some shared memory pages.. */
		if (gfp_mask & __GFP_IO) {
			while (shm_swap(priority, gfp_mask)) {
				if (!--count)
					goto done;
			}
		}

		/* Then, try to page stuff out.. */
		while (swap_out(priority, gfp_mask)) {
			if (!--count)
				goto done;
		}
	} while (--priority >= 0);
done:
	return priority >= 0;
}
static struct task_struct *kswapd_process;
/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically executes once a second, trickling out pages
 * so that we have _some_ free memory available even if there
 * is no other activity that frees anything up. This is needed
 * for things like routing etc, where we otherwise might have
 * all activity going on in asynchronous contexts that cannot
 * page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;

	kswapd_process = tsk;
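	/*
	 * Put the daemon into the init session and process group and
	 * block all signals; kswapd runs for the lifetime of the system.
	 */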
	tsk->session = 1;
	tsk->pgrp = 1;
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__get_free_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;
	while (1) {
		/*
		 * Wake up once a second to see if we need to make
		 * more memory available.
		 *
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		do {
			/* kswapd is critical to provide GFP_ATOMIC
			   allocations (not GFP_BIGMEM ones). */
			if (nr_free_pages - nr_free_bigpages >= freepages.high)
				break;

			if (!do_try_to_free_pages(GFP_KSWAPD))
				break;
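			/* Kick the disk task queue so the swap writes we
			   just queued actually go out to the device. */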
			run_task_queue(&tq_disk);
		} while (!tsk->need_resched);
		tsk->state = TASK_INTERRUPTIBLE;
		schedule_timeout(HZ);
	}
}
/*
 * Called by non-kswapd processes when they want more
 * memory.
 *
 * In a perfect world, this should just wake up kswapd
 * and return. We don't actually want to swap stuff out
 * from user processes, because the locking issues are
 * nasty to the extreme (file write locks, and MM locking)
 *
 * One option might be to let kswapd do all the page-out
 * and VM page table scanning that needs locking, and this
 * process thread could do just the mmap shrink stage that
 * can be done by just dropping cached pages without having
 * any deadlock issues.
 */
int try_to_free_pages(unsigned int gfp_mask)
{
	int retval = 1;

	wake_up_process(kswapd_process);
	if (gfp_mask & __GFP_WAIT)
		retval = do_try_to_free_pages(gfp_mask);
	return retval;
}
static int __init kswapd_init(void)
{
	printk("Starting kswapd v1.6\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
	return 0;
}

module_init(kswapd_init)