/*	$NetBSD: uvm_pdaemon.c,v 1.99 2009/08/18 02:43:49 yamt Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Charles D. Cranor,
 *      Washington University, the University of California, Berkeley and
 *      its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
 * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * uvm_pdaemon.c: the page daemon
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.99 2009/08/18 02:43:49 yamt Exp $");

#include "opt_uvmhist.h"
#include "opt_readahead.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/buf.h>
#include <sys/module.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>

/*
 * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
 * in a pass thru the inactive list when swap is full.  the value should be
 * "small"... if it's too large we'll cycle the active pages thru the inactive
 * queue too quickly for them to be referenced and avoid being freed.
 */

#define	UVMPD_NUMDIRTYREACTS	16

#define	UVMPD_NUMTRYLOCKOWNER	16

/*
 * local prototypes
 */

static void	uvmpd_scan(void);
static void	uvmpd_scan_queue(void);
static void	uvmpd_tune(void);

unsigned int uvm_pagedaemon_waiters;

/*
 * XXX hack to avoid hangs when large processes fork.
 */
u_int uvm_extrapages;

static kmutex_t uvm_reclaim_lock;

SLIST_HEAD(uvm_reclaim_hooks, uvm_reclaim_hook) uvm_reclaim_list;

/*
 * uvm_wait: wait (sleep) for the page daemon to free some pages
 *
 * => should be called with all locks released
 * => should _not_ be called by the page daemon (to avoid deadlock)
 */

void
uvm_wait(const char *wmsg)
{
	int timo = 0;

	mutex_spin_enter(&uvm_fpageqlock);

	/*
	 * check for page daemon going to sleep (waiting for itself)
	 */

	if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) {

		/*
		 * now we have a problem: the pagedaemon wants to go to
		 * sleep until it frees more memory.  but how can it
		 * free more memory if it is asleep?  that is a deadlock.
		 * we have two options:
		 *  [1] panic now
		 *  [2] put a timeout on the sleep, thus causing the
		 *      pagedaemon to only pause (rather than sleep forever)
		 *
		 * note that option [2] will only help us if we get lucky
		 * and some other process on the system breaks the deadlock
		 * by exiting or freeing memory (thus allowing the pagedaemon
		 * to continue).  for now we panic if DEBUG is defined,
		 * otherwise we hope for the best with option [2] (better
		 * yet, this should never happen in the first place!).
		 */

		printf("pagedaemon: deadlock detected!\n");
		timo = hz >> 3;		/* set timeout */

#if defined(DEBUG)
		/* DEBUG: panic so we can debug it */
		panic("pagedaemon deadlock");
#endif
	}

	uvm_pagedaemon_waiters++;
	wakeup(&uvm.pagedaemon);		/* wake the daemon! */
	UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvm_fpageqlock, false, wmsg, timo);
}

/*
 * uvm_kick_pdaemon: perform checks to determine if we need to
 * give the pagedaemon a nudge, and do so if necessary.
 *
 * => called with uvm_fpageqlock held.
 */

void
uvm_kick_pdaemon(void)
{

	KASSERT(mutex_owned(&uvm_fpageqlock));

	if (uvmexp.free + uvmexp.paging < uvmexp.freemin ||
	    (uvmexp.free + uvmexp.paging < uvmexp.freetarg &&
	     uvmpdpol_needsscan_p())) {
		wakeup(&uvm.pagedaemon);
	}
}

/*
 * uvmpd_tune: tune paging parameters
 *
 * => called whenever memory is added to (or removed from?) the system
 * => caller must call with page queues locked
 */

static void
uvmpd_tune(void)
{
	int val;

	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);

	/*
	 * try to keep 0.5% of available RAM free, but limit to between
	 * 128k and 1024k per-CPU.  XXX: what are these values good for?
	 */
	val = uvmexp.npages / 200;
	val = MAX(val, (128*1024) >> PAGE_SHIFT);
	val = MIN(val, (1024*1024) >> PAGE_SHIFT);
	val *= ncpu;
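
	/*
	 * worked example (assuming 4kB pages, i.e. PAGE_SHIFT == 12):
	 * the clamps above are 128k >> 12 = 32 and 1024k >> 12 = 256 pages,
	 * so any machine with more than 200 * 256 = 51200 pages (~200MB of
	 * RAM) ends up keeping 256 pages (1MB) free per CPU, before the
	 * reserve check below.
	 */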

	/* Make sure there's always a user page free. */
	if (val < uvmexp.reserve_kernel + 1)
		val = uvmexp.reserve_kernel + 1;
	uvmexp.freemin = val;

	/* Calculate free target. */
	val = (uvmexp.freemin * 4) / 3;
	if (val <= uvmexp.freemin)
		val = uvmexp.freemin + 1;
	uvmexp.freetarg = val + atomic_swap_uint(&uvm_extrapages, 0);

	uvmexp.wiredmax = uvmexp.npages / 3;
	UVMHIST_LOG(pdhist, "<- done, freemin=%d, freetarg=%d, wiredmax=%d",
	    uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
}

/*
 * uvm_pageout: the main loop for the pagedaemon
 */

void
uvm_pageout(void *arg)
{
	int bufcnt, npages = 0;
	int extrapages = 0;
	struct pool *pp;
	uint64_t where;
	struct uvm_reclaim_hook *hook;

	UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);

	/*
	 * ensure correct priority and set paging parameters...
	 */

	uvm.pagedaemon_lwp = curlwp;
	mutex_enter(&uvm_pageqlock);
	npages = uvmexp.npages;
	uvmpd_tune();
	mutex_exit(&uvm_pageqlock);

	/*
	 * main loop
	 */

	for (;;) {
		bool needsscan, needsfree;

		mutex_spin_enter(&uvm_fpageqlock);
		if (uvm_pagedaemon_waiters == 0 || uvmexp.paging > 0) {
			UVMHIST_LOG(pdhist,"  <<SLEEPING>>",0,0,0,0);
			UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
			    &uvm_fpageqlock, false, "pgdaemon", 0);
			uvmexp.pdwoke++;
			UVMHIST_LOG(pdhist,"  <<WOKE UP>>",0,0,0,0);
		} else {
			mutex_spin_exit(&uvm_fpageqlock);
		}

		/*
		 * now lock page queues and recompute inactive count
		 */

		mutex_enter(&uvm_pageqlock);
		if (npages != uvmexp.npages || extrapages != uvm_extrapages) {
			npages = uvmexp.npages;
			extrapages = uvm_extrapages;
			mutex_spin_enter(&uvm_fpageqlock);
			uvmpd_tune();
			mutex_spin_exit(&uvm_fpageqlock);
		}

		uvmpdpol_tune();

		/*
		 * Estimate how much buffer memory may be worth draining
		 * below.  Note that bufmem is returned to the system only
		 * when an entire pool page is empty.
		 */
		mutex_spin_enter(&uvm_fpageqlock);
		bufcnt = uvmexp.freetarg - uvmexp.free;
		if (bufcnt < 0)
			bufcnt = 0;

		UVMHIST_LOG(pdhist,"  free/ftarg=%d/%d",
		    uvmexp.free, uvmexp.freetarg, 0,0);

		needsfree = uvmexp.free + uvmexp.paging < uvmexp.freetarg;
		needsscan = needsfree || uvmpdpol_needsscan_p();

		/*
		 * scan if needed
		 */
		if (needsscan) {
			mutex_spin_exit(&uvm_fpageqlock);
			uvmpd_scan();
			mutex_spin_enter(&uvm_fpageqlock);
		}

		/*
		 * if there's any free memory to be had,
		 * wake up any waiters.
		 */
		if (uvmexp.free > uvmexp.reserve_kernel ||
		    uvmexp.paging == 0) {
			wakeup(&uvmexp.free);
			uvm_pagedaemon_waiters = 0;
		}
		mutex_spin_exit(&uvm_fpageqlock);

		/*
		 * scan done.  unlock page queues (the only lock we are holding)
		 */
		mutex_exit(&uvm_pageqlock);

		/*
		 * if we don't need free memory, we're done.
		 */

		if (!needsfree)
			continue;

		/*
		 * start draining pool resources now that we're not
		 * holding any locks.
		 */
		pool_drain_start(&pp, &where);

		/*
		 * kill unused metadata buffers.
		 */
		mutex_enter(&bufcache_lock);
		buf_drain(bufcnt << PAGE_SHIFT);
		mutex_exit(&bufcache_lock);
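
		/*
		 * call any reclaim hooks registered via
		 * uvm_reclaim_hook_add().
		 */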
		mutex_enter(&uvm_reclaim_lock);
		SLIST_FOREACH(hook, &uvm_reclaim_list, uvm_reclaim_next) {
			(*hook->uvm_reclaim_hook)();
		}
		mutex_exit(&uvm_reclaim_lock);

		/*
		 * complete draining the pools.
		 */
		pool_drain_end(pp, where);
	}
	/*NOTREACHED*/
}

/*
 * uvm_aiodone_worker: a workqueue callback for the aiodone daemon.
 */

void
uvm_aiodone_worker(struct work *wk, void *dummy)
{
	struct buf *bp = (void *)wk;

	KASSERT(&bp->b_work == wk);

	/*
	 * process an i/o that's done.
	 */

	(*bp->b_iodone)(bp);
}
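
/*
 * uvm_pageout_start: note that a pageout of "npages" pages has been
 * started, so that the in-flight pages are counted in uvmexp.paging.
 */
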
void
uvm_pageout_start(int npages)
{

	mutex_spin_enter(&uvm_fpageqlock);
	uvmexp.paging += npages;
	mutex_spin_exit(&uvm_fpageqlock);
}
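
/*
 * uvm_pageout_done: note that a pageout of "npages" pages has completed,
 * and wake up whoever should be notified about the freed memory.
 */
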
void
uvm_pageout_done(int npages)
{

	mutex_spin_enter(&uvm_fpageqlock);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	/*
	 * wake up either the pagedaemon or the LWPs waiting for free memory.
	 */

	if (uvmexp.free <= uvmexp.reserve_kernel) {
		wakeup(&uvm.pagedaemon);
	} else {
		wakeup(&uvmexp.free);
		uvm_pagedaemon_waiters = 0;
	}

	mutex_spin_exit(&uvm_fpageqlock);
}

/*
 * uvmpd_trylockowner: trylock the page's owner.
 *
 * => called with pageq locked.
 * => resolve orphaned O->A loaned page.
 * => return the locked mutex on success.  otherwise, return NULL.
 */

kmutex_t *
uvmpd_trylockowner(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;
	kmutex_t *slock;

	KASSERT(mutex_owned(&uvm_pageqlock));

	if (uobj != NULL) {
		slock = &uobj->vmobjlock;
	} else {
		struct vm_anon *anon = pg->uanon;

		KASSERT(anon != NULL);
		slock = &anon->an_lock;
	}

	if (!mutex_tryenter(slock)) {
		return NULL;
	}

	if (uobj == NULL) {

		/*
		 * set PQ_ANON if it isn't set already.
		 */

		if ((pg->pqflags & PQ_ANON) == 0) {
			KASSERT(pg->loan_count > 0);
			pg->loan_count--;
			pg->pqflags |= PQ_ANON;
			/* anon now owns it */
		}
	}

	return slock;
}

#if defined(VMSWAP)
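
/*
 * a swapcluster collects dirty swap-backed pages into one contiguous
 * range of swap slots so that they can be paged out together with a
 * single uvm_swap_put() call.
 */
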
struct swapcluster {
	int swc_slot;
	int swc_nallocated;
	int swc_nused;
	struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)];
};

static void
swapcluster_init(struct swapcluster *swc)
{

	swc->swc_slot = 0;
	swc->swc_nused = 0;
}
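
/*
 * swapcluster_allocslots: reserve a cluster's worth of swap slots,
 * unless the cluster already holds an allocation.
 */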

static int
swapcluster_allocslots(struct swapcluster *swc)
{
	int slot;
	int npages;

	if (swc->swc_slot != 0) {
		return 0;
	}

	/* Even with strange MAXPHYS, the shift
	   implicitly rounds down to a page. */
	npages = MAXPHYS >> PAGE_SHIFT;
	slot = uvm_swap_alloc(&npages, true);
	if (slot == 0) {
		return ENOMEM;
	}
	swc->swc_slot = slot;
	swc->swc_nallocated = npages;
	swc->swc_nused = 0;

	return 0;
}
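
/*
 * swapcluster_add: assign the next slot in the cluster to "pg", record
 * the slot in the owning anon or aobj, and queue the page for pageout.
 */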

static int
swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
{
	int slot;
	struct uvm_object *uobj;

	KASSERT(swc->swc_slot != 0);
	KASSERT(swc->swc_nused < swc->swc_nallocated);
	KASSERT((pg->pqflags & PQ_SWAPBACKED) != 0);

	slot = swc->swc_slot + swc->swc_nused;
	uobj = pg->uobject;
	if (uobj == NULL) {
		KASSERT(mutex_owned(&pg->uanon->an_lock));
		pg->uanon->an_swslot = slot;
	} else {
		int result;

		KASSERT(mutex_owned(&uobj->vmobjlock));
		result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
		if (result == -1) {
			return ENOMEM;
		}
	}
	swc->swc_pages[swc->swc_nused] = pg;
	swc->swc_nused++;

	return 0;
}
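
/*
 * swapcluster_flush: write out the collected pages with uvm_swap_put()
 * and reset the cluster.  if "now" is false, a partially filled cluster
 * is kept as is so that it can take more pages.
 */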

static void
swapcluster_flush(struct swapcluster *swc, bool now)
{
	int slot;
	int nused;
	int nallocated;
	int error;

	if (swc->swc_slot == 0) {
		return;
	}
	KASSERT(swc->swc_nused <= swc->swc_nallocated);

	slot = swc->swc_slot;
	nused = swc->swc_nused;
	nallocated = swc->swc_nallocated;

	/*
	 * if this is the final pageout we could have a few
	 * unused swap blocks.  if so, free them now.
	 */

	if (nused < nallocated) {
		if (!now) {
			return;
		}
		uvm_swap_free(slot + nused, nallocated - nused);
	}

	/*
	 * now start the pageout.
	 */

	if (nused > 0) {
		uvmexp.pdpageouts++;
		uvm_pageout_start(nused);
		error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
		KASSERT(error == 0 || error == ENOMEM);
	}

	/*
	 * zero swslot to indicate that we are
	 * no longer building a swap-backed cluster.
	 */

	swc->swc_slot = 0;
	swc->swc_nused = 0;
}
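
/*
 * swapcluster_nused: number of pages currently held in the cluster.
 */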

static int
swapcluster_nused(struct swapcluster *swc)
{

	return swc->swc_nused;
}

/*
 * uvmpd_dropswap: free any swap allocated to this page.
 *
 * => called with owner locked.
 * => return true if a page had an associated slot.
 */

static bool
uvmpd_dropswap(struct vm_page *pg)
{
	bool result = false;
	struct vm_anon *anon = pg->uanon;

	if ((pg->pqflags & PQ_ANON) && anon->an_swslot) {
		uvm_swap_free(anon->an_swslot, 1);
		anon->an_swslot = 0;
		pg->flags &= ~PG_CLEAN;
		result = true;
	} else if (pg->pqflags & PQ_AOBJ) {
		int slot = uao_set_swslot(pg->uobject,
		    pg->offset >> PAGE_SHIFT, 0);
		if (slot) {
			uvm_swap_free(slot, 1);
			pg->flags &= ~PG_CLEAN;
			result = true;
		}
	}

	return result;
}

/*
 * uvmpd_trydropswap: try to free any swap allocated to this page.
 *
 * => return true if a slot is successfully freed.
 */

bool
uvmpd_trydropswap(struct vm_page *pg)
{
	kmutex_t *slock;
	bool result;

	if ((pg->flags & PG_BUSY) != 0) {
		return false;
	}

	/*
	 * lock the page's owner.
	 */

	slock = uvmpd_trylockowner(pg);
	if (slock == NULL) {
		return false;
	}

	/*
	 * skip this page if it's busy.
	 */

	if ((pg->flags & PG_BUSY) != 0) {
		mutex_exit(slock);
		return false;
	}

	result = uvmpd_dropswap(pg);

	mutex_exit(slock);

	return result;
}

#endif /* defined(VMSWAP) */

/*
 * uvmpd_scan_queue: scan the replace candidate list for pages
 * to clean or free.
 *
 * => called with page queues locked
 * => we work on meeting our free target by converting inactive pages
 *    into free pages.
 * => we handle the building of swap-backed clusters
 */

static void
uvmpd_scan_queue(void)
{
	struct vm_page *p;
	struct uvm_object *uobj;
	struct vm_anon *anon;
#if defined(VMSWAP)
	struct swapcluster swc;
#endif /* defined(VMSWAP) */
	int dirtyreacts;
	int lockownerfail;
	kmutex_t *slock;
	UVMHIST_FUNC("uvmpd_scan_queue"); UVMHIST_CALLED(pdhist);

	/*
	 * swc_slot is non-zero if we are building a swap cluster.  we want
	 * to stay in the loop while we have a page to scan or we have
	 * a swap-cluster to build.
	 */

#if defined(VMSWAP)
	swapcluster_init(&swc);
#endif /* defined(VMSWAP) */

	dirtyreacts = 0;
	lockownerfail = 0;
	uvmpdpol_scaninit();

	while (/* CONSTCOND */ 1) {

		/*
		 * see if we've met the free target.
		 */

		if (uvmexp.free + uvmexp.paging
#if defined(VMSWAP)
		    + swapcluster_nused(&swc)
#endif /* defined(VMSWAP) */
		    >= uvmexp.freetarg << 2 ||
		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
			UVMHIST_LOG(pdhist,"  met free target: "
			    "exit loop", 0, 0, 0, 0);
			break;
		}

		p = uvmpdpol_selectvictim();
		if (p == NULL) {
			break;
		}
		KASSERT(uvmpdpol_pageisqueued_p(p));
		KASSERT(p->wire_count == 0);

		/*
		 * we are below target and have a new page to consider.
		 */

		anon = p->uanon;
		uobj = p->uobject;

		/*
		 * first we attempt to lock the object that this page
		 * belongs to.  if our attempt fails we skip on to
		 * the next page (no harm done).  it is important to
		 * "try" locking the object as we are locking in the
		 * wrong order (pageq -> object) and we don't want to
		 * deadlock.
		 *
		 * the only time we expect to see an ownerless page
		 * (i.e. a page with no uobject and !PQ_ANON) is if an
		 * anon has loaned a page from a uvm_object and the
		 * uvm_object has dropped the ownership.  in that
		 * case, the anon can "take over" the loaned page
		 * and make it its own.
		 */

		slock = uvmpd_trylockowner(p);
		if (slock == NULL) {
			/*
			 * yield the cpu to give an LWP holding the lock a
			 * chance to run.  otherwise we can busy-loop too
			 * long if the page queue is filled with a lot of
			 * pages from few objects.
			 */
			lockownerfail++;
			if (lockownerfail > UVMPD_NUMTRYLOCKOWNER) {
				mutex_exit(&uvm_pageqlock);
				/* XXX Better than yielding but inadequate. */
				kpause("livelock", false, 1, NULL);
				mutex_enter(&uvm_pageqlock);
				lockownerfail = 0;
			}
			continue;
		}
		if (p->flags & PG_BUSY) {
			mutex_exit(slock);
			uvmexp.pdbusy++;
			continue;
		}

		/* does the page belong to an object? */
		if (uobj != NULL) {
			uvmexp.pdobscan++;
		} else {
#if defined(VMSWAP)
			KASSERT(anon != NULL);
			uvmexp.pdanscan++;
#else /* defined(VMSWAP) */
			panic("%s: anon", __func__);
#endif /* defined(VMSWAP) */
		}

		/*
		 * we now have the object and the page queues locked.
		 * if the page is not swap-backed, call the object's
		 * pager to flush and free the page.
		 */

#if defined(READAHEAD_STATS)
		if ((p->pqflags & PQ_READAHEAD) != 0) {
			p->pqflags &= ~PQ_READAHEAD;
			uvm_ra_miss.ev_count++;
		}
#endif /* defined(READAHEAD_STATS) */

		if ((p->pqflags & PQ_SWAPBACKED) == 0) {
			KASSERT(uobj != NULL);
			mutex_exit(&uvm_pageqlock);
			(void) (uobj->pgops->pgo_put)(uobj, p->offset,
			    p->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
			mutex_enter(&uvm_pageqlock);
			continue;
		}

		/*
		 * the page is swap-backed.  remove all the permissions
		 * from the page so we can sync the modified info
		 * without any race conditions.  if the page is clean
		 * we can free it now and continue.
		 */

		pmap_page_protect(p, VM_PROT_NONE);
		if ((p->flags & PG_CLEAN) && pmap_clear_modify(p)) {
			p->flags &= ~(PG_CLEAN);
		}
		if (p->flags & PG_CLEAN) {
			int slot;
			int pageidx;

			pageidx = p->offset >> PAGE_SHIFT;
			uvm_pagefree(p);
			uvmexp.pdfreed++;

			/*
			 * for anons, we need to remove the page
			 * from the anon ourselves.  for aobjs,
			 * pagefree did that for us.
			 */

			if (anon) {
				KASSERT(anon->an_swslot != 0);
				anon->an_page = NULL;
				slot = anon->an_swslot;
			} else {
				slot = uao_find_swslot(uobj, pageidx);
			}
			mutex_exit(slock);

			if (slot > 0) {
				/* this page is now only in swap. */
				mutex_enter(&uvm_swap_data_lock);
				KASSERT(uvmexp.swpgonly < uvmexp.swpginuse);
				uvmexp.swpgonly++;
				mutex_exit(&uvm_swap_data_lock);
			}
			continue;
		}

#if defined(VMSWAP)
		/*
		 * this page is dirty, skip it if we'll have met our
		 * free target when all the current pageouts complete.
		 */

		if (uvmexp.free + uvmexp.paging > uvmexp.freetarg << 2) {
			mutex_exit(slock);
			continue;
		}

		/*
		 * free any swap space allocated to the page since
		 * we'll have to write it again with its new data.
		 */

		uvmpd_dropswap(p);

		/*
		 * start new swap pageout cluster (if necessary).
		 *
		 * if swap is full reactivate this page so that
		 * we eventually cycle all pages through the
		 * inactive queue.
		 */

		if (swapcluster_allocslots(&swc)) {
			dirtyreacts++;
			uvm_pageactivate(p);
			mutex_exit(slock);
			continue;
		}

		/*
		 * at this point, we're definitely going to reuse this
		 * page.  mark the page busy and delayed-free.
		 * we should remove the page from the page queues
		 * so we don't ever look at it again.
		 * adjust counters and such.
		 */

		p->flags |= PG_BUSY;
		UVM_PAGE_OWN(p, "scan_queue");

		p->flags |= PG_PAGEOUT;
		uvm_pagedequeue(p);

		uvmexp.pgswapout++;
		mutex_exit(&uvm_pageqlock);

		/*
		 * add the new page to the cluster.
		 */

		if (swapcluster_add(&swc, p)) {
			p->flags &= ~(PG_BUSY|PG_PAGEOUT);
			UVM_PAGE_OWN(p, NULL);
			mutex_enter(&uvm_pageqlock);
			dirtyreacts++;
			uvm_pageactivate(p);
			mutex_exit(slock);
			continue;
		}
		mutex_exit(slock);

		swapcluster_flush(&swc, false);
		mutex_enter(&uvm_pageqlock);

		/*
		 * the pageout is in progress.  bump counters and set up
		 * for the next loop.
		 */

		uvmexp.pdpending++;

#else /* defined(VMSWAP) */
		uvm_pageactivate(p);
		mutex_exit(slock);
#endif /* defined(VMSWAP) */
	}

#if defined(VMSWAP)
	mutex_exit(&uvm_pageqlock);
	swapcluster_flush(&swc, true);
	mutex_enter(&uvm_pageqlock);
#endif /* defined(VMSWAP) */
}

/*
 * uvmpd_scan: scan the page queues and attempt to meet our targets.
 *
 * => called with pageq's locked
 */

static void
uvmpd_scan(void)
{
	int swap_shortage, pages_freed;
	UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);

	uvmexp.pdrevs++;

	/*
	 * work on meeting our targets.  first we work on our free target
	 * by converting inactive pages into free pages.  then we work on
	 * meeting our inactive target by converting active pages to
	 * inactive ones.
	 */

	UVMHIST_LOG(pdhist, "  starting 'free' loop",0,0,0,0);

	pages_freed = uvmexp.pdfreed;
	uvmpd_scan_queue();
	pages_freed = uvmexp.pdfreed - pages_freed;

	/*
	 * detect if we're not going to be able to page anything out
	 * until we free some swap resources from active pages.
	 */

	swap_shortage = 0;
	if (uvmexp.free < uvmexp.freetarg &&
	    uvmexp.swpginuse >= uvmexp.swpgavail &&
	    !uvm_swapisfull() &&
	    pages_freed == 0) {
		swap_shortage = uvmexp.freetarg - uvmexp.free;
	}

	uvmpdpol_balancequeue(swap_shortage);

	/*
	 * if still below the minimum target, try unloading kernel
	 * modules.
	 */

	if (uvmexp.free < uvmexp.freemin) {
		module_thread_kick();
	}
}

/*
 * uvm_reclaimable: decide whether to wait for the pagedaemon.
 *
 * => return true if it seems to be worth doing uvm_wait.
 *
 * XXX should be tunable.
 * XXX should consider pools, etc?
 */

bool
uvm_reclaimable(void)
{
	int filepages;
	int active, inactive;

	/*
	 * if swap is not full, no problem.
	 */

	if (!uvm_swapisfull()) {
		return true;
	}

	/*
	 * file-backed pages can be reclaimed even when swap is full.
	 * if we have more than 1/16 of pageable memory or 5MB, try to reclaim.
	 *
	 * XXX assume the worst case, ie. all wired pages are file-backed.
	 *
	 * XXX should consider other reclaimable memory,
	 * ie. pools and the traditional buffer cache.
	 */

	filepages = uvmexp.filepages + uvmexp.execpages - uvmexp.wired;
	uvm_estimatepageable(&active, &inactive);
	if (filepages >= MIN((active + inactive) >> 4,
	    5 * 1024 * 1024 >> PAGE_SHIFT)) {
		return true;
	}

	/*
	 * kill the process, fail allocation, etc..
	 */

	return false;
}
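
/*
 * uvm_estimatepageable: ask the page replacement policy for its
 * estimate of the number of active and inactive pageable pages.
 */
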
void
uvm_estimatepageable(int *active, int *inactive)
{

	uvmpdpol_estimatepageable(active, inactive);
}

void
uvm_reclaim_init(void)
{

	/* Initialize UVM reclaim hooks. */
	mutex_init(&uvm_reclaim_lock, MUTEX_DEFAULT, IPL_NONE);
	SLIST_INIT(&uvm_reclaim_list);
}
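
/*
 * uvm_reclaim_hook_add: register a hook for the pagedaemon to call
 * when it is trying to free memory.
 */
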
void
uvm_reclaim_hook_add(struct uvm_reclaim_hook *hook)
{

	KASSERT(hook != NULL);

	mutex_enter(&uvm_reclaim_lock);
	SLIST_INSERT_HEAD(&uvm_reclaim_list, hook, uvm_reclaim_next);
	mutex_exit(&uvm_reclaim_lock);
}
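
/*
 * uvm_reclaim_hook_del: remove a previously registered reclaim hook.
 */
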
void
uvm_reclaim_hook_del(struct uvm_reclaim_hook *hook_entry)
{
	struct uvm_reclaim_hook *hook;

	KASSERT(hook_entry != NULL);

	mutex_enter(&uvm_reclaim_lock);
	SLIST_FOREACH(hook, &uvm_reclaim_list, uvm_reclaim_next) {
		if (hook != hook_entry) {
			continue;
		}

		SLIST_REMOVE(&uvm_reclaim_list, hook, uvm_reclaim_hook,
		    uvm_reclaim_next);
		break;
	}

	mutex_exit(&uvm_reclaim_lock);
}