kernel/vm/vm_pvn.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25
  26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27 /*        All Rights Reserved   */
  28
  29 /*
  30  * University Copyright- Copyright (c) 1982, 1986, 1988
  31  * The Regents of the University of California
  32  * All Rights Reserved
  33  *
  34  * University Acknowledgment- Portions of this document are derived from
  35  * software developed by the University of California, Berkeley, and its
  36  * contributors.
  37  */
  38
  39 /*
  40  * VM - paged vnode.
  41  *
  42  * This file supplies vm support for the vnode operations that deal with pages.
  43  */
  44 #include <sys/types.h>
  45 #include <sys/t_lock.h>
  46 #include <sys/param.h>
  47 #include <sys/sysmacros.h>
  48 #include <sys/systm.h>
  49 #include <sys/time.h>
  50 #include <sys/buf.h>
  51 #include <sys/vnode.h>
  52 #include <sys/uio.h>
  53 #include <sys/vmsystm.h>
  54 #include <sys/mman.h>
  55 #include <sys/vfs.h>
  56 #include <sys/cred.h>
  57 #include <sys/user.h>
  58 #include <sys/kmem.h>
  59 #include <sys/cmn_err.h>
  60 #include <sys/debug.h>
  61 #include <sys/cpuvar.h>
  62 #include <sys/vtrace.h>
  63 #include <sys/tnf_probe.h>
  64
  65 #include <vm/hat.h>
  66 #include <vm/as.h>
  67 #include <vm/seg.h>
  68 #include <vm/rm.h>
  69 #include <vm/pvn.h>
  70 #include <vm/page.h>
  71 #include <vm/seg_map.h>
  72 #include <vm/seg_kmem.h>
  73 #include <sys/fs/swapnode.h>
  74
  75 int pvn_nofodklust = 0;
  76 int pvn_write_noklust = 0;
  77
  78 static struct kmem_cache *marker_cache = NULL;
  79
  80 /*
  81  * Find the largest contiguous block which contains `addr' for file offset
  82  * `offset' in it while living within the file system block sizes (`vp_off'
  83  * and `vp_len') and the address space limits for which no pages currently
  84  * exist and which map to consecutive file offsets.
  85  */
  86 page_t *
  87 pvn_read_kluster(
  88         struct vnode *vp,
  89         uoff_t off,
  90         struct seg *seg,
  91         caddr_t addr,
  92         uoff_t *offp,                   /* return values */
  93         size_t *lenp,                           /* return values */
  94         uoff_t vp_off,
  95         size_t vp_len,
  96         int isra)
  97 {
  98         ssize_t deltaf, deltab;
  99         page_t *pp;
 100         page_t *plist = NULL;
 101         spgcnt_t pagesavail;
 102         uoff_t vp_end;
 103
 104         ASSERT(off >= vp_off && off < vp_off + vp_len);
 105
 106         /*
 107          * We only want to do klustering/read ahead if there
 108          * is more than minfree pages currently available.
 109          */
 110         pagesavail = freemem - minfree;
 111
 112         if (pagesavail <= 0)
 113                 if (isra)
 114                         return (NULL);    /* ra case - give up */
 115                 else
 116                         pagesavail = 1;             /* must return a page */
 117
 118         /* We calculate in pages instead of bytes due to 32-bit overflows */
 119         if (pagesavail < (spgcnt_t)btopr(vp_len)) {
 120                 /*
 121                  * Don't have enough free memory for the
 122                  * max request, try sizing down vp request.
 123                  */
 124                 deltab = (ssize_t)(off - vp_off);
 125                 vp_len -= deltab;
 126                 vp_off += deltab;
 127                 if (pagesavail < btopr(vp_len)) {
 128                         /*
 129                          * Still not enough memory, just settle for
 130                          * pagesavail which is at least 1.
 131                          */
 132                         vp_len = ptob(pagesavail);
 133                 }
 134         }
 135
 136         vp_end = vp_off + vp_len;
 137         ASSERT(off >= vp_off && off < vp_end);
 138
 139         if (isra && segop_kluster(seg, addr, 0))
 140                 return (NULL);  /* segment driver says no */
 141
 142         if ((plist = page_create_va(&vp->v_object, off,
 143             PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
 144                 return (NULL);
 145
 146         if (vp_len <= PAGESIZE || pvn_nofodklust) {
 147                 *offp = off;
 148                 *lenp = MIN(vp_len, PAGESIZE);
 149         } else {
 150                 /*
 151                  * Scan back from front by incrementing "deltab" and
 152                  * comparing "off" with "vp_off + deltab" to avoid
 153                  * "signed" versus "unsigned" conversion problems.
 154                  */
 155                 for (deltab = PAGESIZE; off >= vp_off + deltab;
 156                     deltab += PAGESIZE) {
 157                         /*
 158                          * Call back to the segment driver to verify that
 159                          * the klustering/read ahead operation makes sense.
 160                          */
 161                         if (segop_kluster(seg, addr, -deltab))
 162                                 break;          /* page not eligible */
 163                         if ((pp = page_create_va(&vp->v_object, off - deltab,
 164                             PAGESIZE, PG_EXCL, seg, addr - deltab))
 165                             == NULL)
 166                                 break;          /* already have the page */
 167                         /*
 168                          * Add page to front of page list.
 169                          */
 170                         page_add(&plist, pp);
 171                 }
 172                 deltab -= PAGESIZE;
 173
 174                 /* scan forward from front */
 175                 for (deltaf = PAGESIZE; off + deltaf < vp_end;
 176                     deltaf += PAGESIZE) {
 177                         /*
 178                          * Call back to the segment driver to verify that
 179                          * the klustering/read ahead operation makes sense.
 180                          */
 181                         if (segop_kluster(seg, addr, deltaf))
 182                                 break;          /* page not file extension */
 183                         if ((pp = page_create_va(&vp->v_object, off + deltaf,
 184                             PAGESIZE, PG_EXCL, seg, addr + deltaf))
 185                             == NULL)
 186                                 break;          /* already have page */
 187
 188                         /*
 189                          * Add page to end of page list.
 190                          */
 191                         page_add(&plist, pp);
 192                         plist = plist->p_next;
 193                 }
 194                 *offp = off = off - deltab;
 195                 *lenp = deltab + deltaf;
 196                 ASSERT(off >= vp_off);
 197
 198                 /*
 199                  * If we ended up getting more than was actually
 200                  * requested, retract the returned length to only
 201                  * reflect what was requested.  This might happen
 202                  * if we were allowed to kluster pages across a
 203                  * span of (say) 5 frags, and frag size is less
 204                  * than PAGESIZE.  We need a whole number of
 205                  * pages to contain those frags, but the returned
 206                  * size should only allow the returned range to
 207                  * extend as far as the end of the frags.
 208                  */
 209                 if ((vp_off + vp_len) < (off + *lenp)) {
 210                         ASSERT(vp_end > off);
 211                         *lenp = vp_end - off;
 212                 }
 213         }
 214         return (plist);
 215 }
 216
 217 /*
 218  * Handle pages for this vnode on either side of the page "pp"
 219  * which has been locked by the caller.  This routine will also
 220  * do klustering in the range [vp_off, vp_off + vp_len] up
 221  * until a page which is not found.  The offset and length
 222  * of pages included is returned in "*offp" and "*lenp".
 223  *
 224  * Returns a list of dirty locked pages all ready to be
 225  * written back.
 226  */
 227 page_t *
 228 pvn_write_kluster(
 229         struct vnode *vp,
 230         page_t *pp,
 231         uoff_t *offp,           /* return values */
 232         size_t *lenp,                   /* return values */
 233         uoff_t vp_off,
 234         size_t vp_len,
 235         int flags)
 236 {
 237         uoff_t off;
 238         page_t *dirty;
 239         size_t deltab, deltaf;
 240         se_t se;
 241         uoff_t vp_end;
 242
 243         off = pp->p_offset;
 244
 245         /*
 246          * Kustering should not be done if we are invalidating
 247          * pages since we could destroy pages that belong to
 248          * some other process if this is a swap vnode.
 249          */
 250         if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
 251                 *offp = off;
 252                 *lenp = PAGESIZE;
 253                 return (pp);
 254         }
 255
 256         if (flags & (B_FREE | B_INVAL))
 257                 se = SE_EXCL;
 258         else
 259                 se = SE_SHARED;
 260
 261         dirty = pp;
 262         /*
 263          * Scan backwards looking for pages to kluster by incrementing
 264          * "deltab" and comparing "off" with "vp_off + deltab" to
 265          * avoid "signed" versus "unsigned" conversion problems.
 266          */
 267         for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
 268                 pp = page_lookup_nowait(&vp->v_object, off - deltab, se);
 269                 if (pp == NULL)
 270                         break;          /* page not found */
 271                 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 272                         break;
 273                 page_add(&dirty, pp);
 274         }
 275         deltab -= PAGESIZE;
 276
 277         vp_end = vp_off + vp_len;
 278         /* now scan forwards looking for pages to kluster */
 279         for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
 280                 pp = page_lookup_nowait(&vp->v_object, off + deltaf, se);
 281                 if (pp == NULL)
 282                         break;          /* page not found */
 283                 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 284                         break;
 285                 page_add(&dirty, pp);
 286                 dirty = dirty->p_next;
 287         }
 288
 289         *offp = off - deltab;
 290         *lenp = deltab + deltaf;
 291         return (dirty);
 292 }
 293
 294 /*
 295  * Generic entry point used to release the "shared/exclusive" lock
 296  * and the "p_iolock" on pages after i/o is complete.
 297  */
 298 void
 299 pvn_io_done(page_t *plist)
 300 {
 301         page_t *pp;
 302
 303         while (plist != NULL) {
 304                 pp = plist;
 305                 page_sub(&plist, pp);
 306                 page_io_unlock(pp);
 307                 page_unlock(pp);
 308         }
 309 }
 310
 311 /*
 312  * Entry point to be used by file system getpage subr's and
 313  * other such routines which either want to unlock pages (B_ASYNC
 314  * request) or destroy a list of pages if an error occurred.
 315  */
 316 void
 317 pvn_read_done(page_t *plist, int flags)
 318 {
 319         page_t *pp;
 320
 321         while (plist != NULL) {
 322                 pp = plist;
 323                 page_sub(&plist, pp);
 324                 page_io_unlock(pp);
 325                 if (flags & B_ERROR) {
 326                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 327                 } else {
 328                         (void) page_release(pp, 0);
 329                 }
 330         }
 331 }
 332
 333 /*
 334  * Automagic pageout.
 335  * When memory gets tight, start freeing pages popping out of the
 336  * write queue.
 337  */
 338 int     write_free = 1;
 339 pgcnt_t pages_before_pager = 200;       /* LMXXX */
 340
 341 /*
 342  * Routine to be called when page-out's complete.
 343  * The caller, typically fop_putpage, has to explicity call this routine
 344  * after waiting for i/o to complete (biowait) to free the list of
 345  * pages associated with the buffer.  These pages must be locked
 346  * before i/o is initiated.
 347  *
 348  * If a write error occurs, the pages are marked as modified
 349  * so the write will be re-tried later.
 350  */
 351
 352 void
 353 pvn_write_done(page_t *plist, int flags)
 354 {
 355         int dfree = 0;
 356         int pgrec = 0;
 357         int pgout = 0;
 358         int pgpgout = 0;
 359         int anonpgout = 0;
 360         int anonfree = 0;
 361         int fspgout = 0;
 362         int fsfree = 0;
 363         int execpgout = 0;
 364         int execfree = 0;
 365         page_t *pp;
 366         struct cpu *cpup;
 367         struct vnode *vp = NULL;        /* for probe */
 368         uint_t ppattr;
 369
 370         ASSERT((flags & B_READ) == 0);
 371
 372         /*
 373          * If we are about to start paging anyway, start freeing pages.
 374          */
 375         if (write_free && freemem < lotsfree + pages_before_pager &&
 376             (flags & B_ERROR) == 0) {
 377                 flags |= B_FREE;
 378         }
 379
 380         /*
 381          * Handle each page involved in the i/o operation.
 382          */
 383         while (plist != NULL) {
 384                 pp = plist;
 385                 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
 386                 page_sub(&plist, pp);
 387
 388                 /* Kernel probe support */
 389                 if (vp == NULL)
 390                         vp = pp->p_vnode;
 391
 392                 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
 393                         /*
 394                          * Move page to the top of the v_page list.
 395                          * Skip pages modified during IO.
 396                          */
 397                         vmobject_lock(&vp->v_object);
 398                         if (!hat_ismod(pp))
 399                                 vmobject_move_page_tail(&vp->v_object, pp);
 400                         vmobject_unlock(&vp->v_object);
 401                 }
 402
 403                 if (flags & B_ERROR) {
 404                         /*
 405                          * Write operation failed.  We don't want
 406                          * to destroy (or free) the page unless B_FORCE
 407                          * is set. We set the mod bit again and release
 408                          * all locks on the page so that it will get written
 409                          * back again later when things are hopefully
 410                          * better again.
 411                          * If B_INVAL and B_FORCE is set we really have
 412                          * to destroy the page.
 413                          */
 414                         if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
 415                                 page_io_unlock(pp);
 416                                 VN_DISPOSE(pp, B_INVAL, 0, kcred);
 417                         } else {
 418                                 hat_setmod_only(pp);
 419                                 page_io_unlock(pp);
 420                                 page_unlock(pp);
 421                         }
 422                 } else if (flags & B_INVAL) {
 423                         /*
 424                          * XXX - Failed writes with B_INVAL set are
 425                          * not handled appropriately.
 426                          */
 427                         page_io_unlock(pp);
 428                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 429                 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
 430                         /*
 431                          * Update statistics for pages being paged out
 432                          */
 433                         if (pp->p_vnode) {
 434                                 if (IS_SWAPFSVP(pp->p_vnode)) {
 435                                         anonpgout++;
 436                                 } else {
 437                                         if (pp->p_vnode->v_flag & VVMEXEC) {
 438                                                 execpgout++;
 439                                         } else {
 440                                                 fspgout++;
 441                                         }
 442                                 }
 443                         }
 444                         page_io_unlock(pp);
 445                         pgout = 1;
 446                         pgpgout++;
 447
 448                         /*
 449                          * The page_struct_lock need not be acquired to
 450                          * examine "p_lckcnt" and "p_cowcnt" since we'll
 451                          * have an "exclusive" lock if the upgrade succeeds.
 452                          */
 453                         if (page_tryupgrade(pp) &&
 454                             pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
 455                                 /*
 456                                  * Check if someone has reclaimed the
 457                                  * page.  If ref and mod are not set, no
 458                                  * one is using it so we can free it.
 459                                  * The rest of the system is careful
 460                                  * to use the NOSYNC flag to unload
 461                                  * translations set up for i/o w/o
 462                                  * affecting ref and mod bits.
 463                                  *
 464                                  * Obtain a copy of the real hardware
 465                                  * mod bit using hat_pagesync(pp, HAT_DONTZERO)
 466                                  * to avoid having to flush the cache.
 467                                  */
 468                                 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
 469                                     HAT_SYNC_STOPON_MOD);
 470                         ck_refmod:
 471                                 if (!(ppattr & (P_REF | P_MOD))) {
 472                                         if (hat_page_is_mapped(pp)) {
 473                                                 /*
 474                                                  * Doesn't look like the page
 475                                                  * was modified so now we
 476                                                  * really have to unload the
 477                                                  * translations.  Meanwhile
 478                                                  * another CPU could've
 479                                                  * modified it so we have to
 480                                                  * check again.  We don't loop
 481                                                  * forever here because now
 482                                                  * the translations are gone
 483                                                  * and no one can get a new one
 484                                                  * since we have the "exclusive"
 485                                                  * lock on the page.
 486                                                  */
 487                                                 (void) hat_pageunload(pp,
 488                                                     HAT_FORCE_PGUNLOAD);
 489                                                 ppattr = hat_page_getattr(pp,
 490                                                     P_REF | P_MOD);
 491                                                 goto ck_refmod;
 492                                         }
 493                                         /*
 494                                          * Update statistics for pages being
 495                                          * freed
 496                                          */
 497                                         if (pp->p_vnode) {
 498                                                 if (IS_SWAPFSVP(pp->p_vnode)) {
 499                                                         anonfree++;
 500                                                 } else {
 501                                                         if (pp->p_vnode->v_flag
 502                                                             & VVMEXEC) {
 503                                                                 execfree++;
 504                                                         } else {
 505                                                                 fsfree++;
 506                                                         }
 507                                                 }
 508                                         }
 509
 510                                         VN_DISPOSE(pp, B_FREE,
 511                                             (flags & B_DONTNEED), kcred);
 512                                         dfree++;
 513                                 } else {
 514                                         page_unlock(pp);
 515                                         pgrec++;
 516                                 }
 517                         } else {
 518                                 /*
 519                                  * Page is either `locked' in memory
 520                                  * or was reclaimed and now has a
 521                                  * "shared" lock, so release it.
 522                                  */
 523                                 page_unlock(pp);
 524                         }
 525                 } else {
 526                         /*
 527                          * Neither B_FREE nor B_INVAL nor B_ERROR.
 528                          * Just release locks.
 529                          */
 530                         page_io_unlock(pp);
 531                         page_unlock(pp);
 532                 }
 533         }
 534
 535         CPU_STATS_ENTER_K();
 536         cpup = CPU;             /* get cpup now that CPU cannot change */
 537         CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
 538         CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
 539         CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
 540         CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
 541         CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
 542         CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
 543         CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
 544         CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
 545         CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
 546         CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
 547         CPU_STATS_EXIT_K();
 548 }
 549
 550 /*
 551  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 552  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
 553  * operation and is only to be considered if it doesn't involve any
 554  * waiting here.  B_TRUNC indicates that the file is being truncated
 555  * and so no i/o needs to be done. B_FORCE indicates that the page
 556  * must be destroyed so don't try wrting it out.
 557  *
 558  * The caller must ensure that the page is locked.  Returns 1, if
 559  * the page should be written back (the "iolock" is held in this
 560  * case), or 0 if the page has been dealt with or has been
 561  * unlocked.
 562  */
 563 int
 564 pvn_getdirty(page_t *pp, int flags)
 565 {
 566         ASSERT((flags & (B_INVAL | B_FREE)) ?
 567             PAGE_EXCL(pp) : PAGE_SHARED(pp));
 568         ASSERT(PP_ISFREE(pp) == 0);
 569
 570         /*
 571          * If trying to invalidate or free a logically `locked' page,
 572          * forget it.  Don't need page_struct_lock to check p_lckcnt and
 573          * p_cowcnt as the page is exclusively locked.
 574          */
 575         if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
 576             (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
 577                 page_unlock(pp);
 578                 return (0);
 579         }
 580
 581         /*
 582          * Now acquire the i/o lock so we can add it to the dirty
 583          * list (if necessary).  We avoid blocking on the i/o lock
 584          * in the following cases:
 585          *
 586          *      If B_DELWRI is set, which implies that this request is
 587          *      due to a klustering operartion.
 588          *
 589          *      If this is an async (B_ASYNC) operation and we are not doing
 590          *      invalidation (B_INVAL) [The current i/o or fsflush will ensure
 591          *      that the the page is written out].
 592          */
 593         if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
 594                 if (!page_io_trylock(pp)) {
 595                         page_unlock(pp);
 596                         return (0);
 597                 }
 598         } else {
 599                 page_io_lock(pp);
 600         }
 601
 602         /*
 603          * If we want to free or invalidate the page then
 604          * we need to unload it so that anyone who wants
 605          * it will have to take a minor fault to get it.
 606          * Otherwise, we're just writing the page back so we
 607          * need to sync up the hardwre and software mod bit to
 608          * detect any future modifications.  We clear the
 609          * software mod bit when we put the page on the dirty
 610          * list.
 611          */
 612         if (flags & (B_INVAL | B_FREE)) {
 613                 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 614         } else {
 615                 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
 616         }
 617
 618         if (!hat_ismod(pp) || (flags & B_TRUNC)) {
 619                 /*
 620                  * Don't need to add it to the
 621                  * list after all.
 622                  */
 623                 page_io_unlock(pp);
 624                 if (flags & B_INVAL) {
 625                         VN_DISPOSE(pp, B_INVAL, 0, kcred);
 626                 } else if (flags & B_FREE) {
 627                         VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
 628                 } else {
 629                         /*
 630                          * This is advisory path for the callers
 631                          * of fop_putpage() who prefer freeing the
 632                          * page _only_ if no one else is accessing it.
 633                          * E.g. segmap_release()
 634                          *
 635                          * The above hat_ismod() check is useless because:
 636                          * (1) we may not be holding SE_EXCL lock;
 637                          * (2) we've not unloaded _all_ translations
 638                          *
 639                          * Let page_release() do the heavy-lifting.
 640                          */
 641                         (void) page_release(pp, 1);
 642                 }
 643                 return (0);
 644         }
 645
 646         /*
 647          * Page is dirty, get it ready for the write back
 648          * and add page to the dirty list.
 649          */
 650         hat_clrrefmod(pp);
 651
 652         /*
 653          * If we're going to free the page when we're done
 654          * then we can let others try to use it starting now.
 655          * We'll detect the fact that they used it when the
 656          * i/o is done and avoid freeing the page.
 657          */
 658         if (flags & B_FREE)
 659                 page_downgrade(pp);
 660
 661         return (1);
 662 }
 663
 664
 665 /*ARGSUSED*/
 666 static int
 667 marker_constructor(void *buf, void *cdrarg, int kmflags)
 668 {
 669         page_t *mark = buf;
 670         bzero(mark, sizeof (page_t));
 671         PP_SETPVN_TAG(mark);
 672         return (0);
 673 }
 674
 675 void
 676 pvn_init()
 677 {
 678         marker_cache = kmem_cache_create("marker_cache",
 679             sizeof (page_t), 0, marker_constructor,
 680             NULL, NULL, NULL, NULL, 0);
 681 }
 682
 683 static inline void
 684 move_marker(struct vnode *vnode, struct page *ref, struct page *mark)
 685 {
 686         list_remove(&vnode->v_object.list, mark);
 687         list_insert_before(&vnode->v_object.list, ref, mark);
 688 }
 689
 690 /*
 691  * Process a vnode's page list for all pages whose offset is >= off.
 692  * Pages are to either be free'd, invalidated, or written back to disk.
 693  *
 694  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 695  * is specified, otherwise they are "shared" locked.
 696  *
 697  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 698  *
 699  * Special marker page_t's are inserted in the list in order
 700  * to keep track of where we are in the list when locks are dropped.
 701  *
 702  * Note the list is circular and insertions can happen only at the
 703  * head and tail of the list. The algorithm ensures visiting all pages
 704  * on the list in the following way:
 705  *
 706  *    Drop two marker pages at the end of the list.
 707  *
 708  *    Move one marker page backwards towards the start of the list until
 709  *    it is at the list head, processing the pages passed along the way.
 710  *
 711  *    Due to race conditions when the vnode page mutex is dropped,
 712  *    additional pages can be added to either end of the list, so we'll
 713  *    continue to move the marker and process pages until it is up against
 714  *    the end marker.
 715  *
 716  * There is one special exit condition. If we are processing a VMODSORT
 717  * vnode and only writing back modified pages, we can stop as soon as
 718  * we run into an unmodified page.  This makes fsync(3) operations fast.
 719  */
 720 int
 721 pvn_vplist_dirty(
 722         vnode_t         *vp,
 723         uoff_t  off,
 724         int             (*putapage)(vnode_t *, page_t *, uoff_t *,
 725                         size_t *, int, cred_t *),
 726         int             flags,
 727         cred_t          *cred)
 728 {
 729         page_t          *pp;
 730         page_t          *mark;          /* marker page that moves toward head */
 731         page_t          *end;           /* marker page at end of list */
 732         int             err = 0;
 733         int             error;
 734         se_t            se;
 735
 736         ASSERT(vp->v_type != VCHR);
 737
 738         if (!vn_has_cached_data(vp))
 739                 return (0);
 740
 741
 742         /*
 743          * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
 744          *
 745          * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
 746          * from getting blocked while flushing pages to a dead NFS server.
 747          */
 748         mutex_enter(&vp->v_lock);
 749         if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
 750                 mutex_exit(&vp->v_lock);
 751                 return (EAGAIN);
 752         }
 753
 754         while (vp->v_flag & VVMLOCK)
 755                 cv_wait(&vp->v_cv, &vp->v_lock);
 756
 757         if (!vn_has_cached_data(vp)) {
 758                 mutex_exit(&vp->v_lock);
 759                 return (0);
 760         }
 761
 762         vp->v_flag |= VVMLOCK;
 763         mutex_exit(&vp->v_lock);
 764
 765
 766         /*
 767          * Set up the marker pages used to walk the list
 768          */
 769         end = kmem_cache_alloc(marker_cache, KM_SLEEP);
 770         end->p_object = &vp->v_object;
 771         end->p_vnode = vp;
 772         end->p_offset = (uoff_t)-2;
 773         mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
 774         mark->p_object = &vp->v_object;
 775         mark->p_vnode = vp;
 776         mark->p_offset = (uoff_t)-1;
 777
 778         /*
 779          * Grab the lock protecting the vnode's page list
 780          * note that this lock is dropped at times in the loop.
 781          */
 782         vmobject_lock(&vp->v_object);
 783         if (!vn_has_cached_data(vp))
 784                 goto leave;
 785
 786         /*
 787          * insert the markers and loop through the list of pages
 788          */
 789         vmobject_add_page_tail(&vp->v_object, mark);
 790         vmobject_add_page_tail(&vp->v_object, end);
 791
 792         for (;;) {
 793
 794                 /*
 795                  * If only doing an async write back, then we can
 796                  * stop as soon as we get to start of the list.
 797                  */
 798                 if (flags == B_ASYNC && vmobject_get_head(&vp->v_object) == mark)
 799                         break;
 800
 801                 pp = vmobject_get_prev_loop(&vp->v_object, mark);
 802
 803                 /*
 804                  * otherwise stop when we've gone through all the pages
 805                  */
 806                 if (pp == end)
 807                         break;
 808
 809                 VERIFY(pp->p_object == &vp->v_object);
 810                 ASSERT(pp->p_vnode == vp);
 811
 812                 /*
 813                  * If just flushing dirty pages to disk and this vnode
 814                  * is using a sorted list of pages, we can stop processing
 815                  * as soon as we find an unmodified page. Since all the
 816                  * modified pages are visited first.
 817                  */
 818                 if (IS_VMODSORT(vp) &&
 819                     !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
 820                         if (!hat_ismod(pp) && !page_io_locked(pp)) {
 821 #ifdef  DEBUG
 822                                 /*
 823                                  * For debug kernels examine what should be
 824                                  * all the remaining clean pages, asserting
 825                                  * that they are not modified.
 826                                  */
 827                                 page_t  *chk = pp;
 828                                 int     attr;
 829
 830                                 move_marker(vp, pp, mark);
 831
 832                                 do {
 833                                         chk = vmobject_get_prev_loop(&vp->v_object,
 834                                                                      chk);
 835                                         ASSERT(chk != end);
 836                                         if (chk == mark)
 837                                                 continue;
 838                                         attr = hat_page_getattr(chk, P_MOD |
 839                                             P_REF);
 840                                         if ((attr & P_MOD) == 0)
 841                                                 continue;
 842                                         panic("v_object list not all clean: "
 843                                             "page_t*=%p vnode=%p off=%lx "
 844                                             "attr=0x%x last clean page_t*=%p\n",
 845                                             chk, chk->p_vnode,
 846                                             (long)chk->p_offset, attr, pp);
 847                                 } while (chk != vmobject_get_head(&vp->v_object));
 848 #endif
 849                                 break;
 850                         } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
 851                                 /*
 852                                  * Couldn't get io lock, wait until IO is done.
 853                                  * Block only for sync IO since we don't want
 854                                  * to block async IO.
 855                                  */
 856                                 vmobject_unlock(&vp->v_object);
 857                                 page_io_wait(pp);
 858                                 vmobject_lock(&vp->v_object);
 859                                 continue;
 860                         }
 861                 }
 862
 863                 /*
 864                  * Skip this page if the offset is out of the desired range.
 865                  * Just move the marker and continue.
 866                  */
 867                 if (pp->p_offset < off) {
 868                         move_marker(vp, pp, mark);
 869                         continue;
 870                 }
 871
 872                 /*
 873                  * If we are supposed to invalidate or free this
 874                  * page, then we need an exclusive lock.
 875                  */
 876                 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
 877
 878                 /*
 879                  * We must acquire the page lock for all synchronous
 880                  * operations (invalidate, free and write).
 881                  */
 882                 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
 883                         /*
 884                          * If the page_lock() drops the mutex
 885                          * we must retry the loop.
 886                          */
 887                         if (!page_lock(pp, se, &vp->v_object, P_NO_RECLAIM))
 888                                 continue;
 889
 890                         /*
 891                          * It's ok to move the marker page now.
 892                          */
 893                         move_marker(vp, pp, mark);
 894                 } else {
 895
 896                         /*
 897                          * update the marker page for all remaining cases
 898                          */
 899                         move_marker(vp, pp, mark);
 900
 901                         /*
 902                          * For write backs, If we can't lock the page, it's
 903                          * invalid or in the process of being destroyed.  Skip
 904                          * it, assuming someone else is writing it.
 905                          */
 906                         if (!page_trylock(pp, se))
 907                                 continue;
 908                 }
 909
 910                 VERIFY(pp->p_object == &vp->v_object);
 911                 ASSERT(pp->p_vnode == vp);
 912
 913                 /*
 914                  * Successfully locked the page, now figure out what to
 915                  * do with it. Free pages are easily dealt with, invalidate
 916                  * if desired or just go on to the next page.
 917                  */
 918                 if (PP_ISFREE(pp)) {
 919                         if ((flags & B_INVAL) == 0) {
 920                                 page_unlock(pp);
 921                                 continue;
 922                         }
 923
 924                         /*
 925                          * Invalidate (destroy) the page.
 926                          */
 927                         vmobject_unlock(&vp->v_object);
 928                         page_destroy_free(pp);
 929                         vmobject_lock(&vp->v_object);
 930                         continue;
 931                 }
 932
 933                 /*
 934                  * pvn_getdirty() figures out what do do with a dirty page.
 935                  * If the page is dirty, the putapage() routine will write it
 936                  * and will kluster any other adjacent dirty pages it can.
 937                  *
 938                  * pvn_getdirty() and `(*putapage)' unlock the page.
 939                  */
 940                 vmobject_unlock(&vp->v_object);
 941                 if (pvn_getdirty(pp, flags)) {
 942                         error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
 943                         if (!err)
 944                                 err = error;
 945                 }
 946                 vmobject_lock(&vp->v_object);
 947         }
 948         vmobject_remove_page(&vp->v_object, mark);
 949         vmobject_remove_page(&vp->v_object, end);
 950
 951 leave:
 952         /*
 953          * Release v_object mutex, also VVMLOCK and wakeup blocked
 954          * threads
 955          */
 956         vmobject_unlock(&vp->v_object);
 957         kmem_cache_free(marker_cache, mark);
 958         kmem_cache_free(marker_cache, end);
 959         mutex_enter(&vp->v_lock);
 960         vp->v_flag &= ~VVMLOCK;
 961         cv_broadcast(&vp->v_cv);
 962         mutex_exit(&vp->v_lock);
 963         return (err);
 964 }
 965
 966 /*
 967  * Walk the vp->v_object list, for every page call the callback function
 968  * pointed by *page_check. If page_check returns non-zero, then mark the
 969  * page as modified and if VMODSORT is set, move it to the end of
 970  * v_object list. Moving makes sense only if we have at least two pages.
 971  */
 972 void
 973 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
 974 {
 975         page_t  *pp, *next, *end;
 976         int     shuffle;
 977
 978         vmobject_lock(&vp->v_object);
 979
 980         if (!vn_has_cached_data(vp)) {
 981                 vmobject_unlock(&vp->v_object);
 982                 return;
 983         }
 984
 985         end = vmobject_get_tail(&vp->v_object);
 986         pp = vmobject_get_head(&vp->v_object);
 987         shuffle = IS_VMODSORT(vp) && (pp != end);
 988
 989         for (;;) {
 990                 next = vmobject_get_next_loop(&vp->v_object, pp);
 991                 if (!PP_ISPVN_TAG(pp) && page_check(pp)) {
 992                         /*
 993                          * hat_setmod_only() in contrast to hat_setmod() does
 994                          * not shuffle the pages and does not grab the vnode
 995                          * page mutex. Exactly what we need.
 996                          */
 997                         hat_setmod_only(pp);
 998                         if (shuffle)
 999                                 vmobject_move_page_tail(&vp->v_object, pp);
1000                 }
1001                 /* Stop if we have just processed the last page. */
1002                 if (pp == end)
1003                         break;
1004                 pp = next;
1005         }
1006
1007         vmobject_unlock(&vp->v_object);
1008 }
1009
1010 /*
1011  * Zero out zbytes worth of data. Caller should be aware that this
1012  * routine may enter back into the fs layer (xxx_getpage). Locks
1013  * that the xxx_getpage routine may need should not be held while
1014  * calling this.
1015  */
1016 void
1017 pvn_vpzero(struct vnode *vp, uoff_t vplen, size_t zbytes)
1018 {
1019         caddr_t addr;
1020
1021         ASSERT(vp->v_type != VCHR);
1022
1023         if (!vn_has_cached_data(vp))
1024                 return;
1025
1026         /*
1027          * zbytes may be zero but there still may be some portion of
1028          * a page which needs clearing (since zbytes is a function
1029          * of filesystem block size, not pagesize.)
1030          */
1031         if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1032                 return;
1033
1034         /*
1035          * We get the last page and handle the partial
1036          * zeroing via kernel mappings.  This will make the page
1037          * dirty so that we know that when this page is written
1038          * back, the zeroed information will go out with it.  If
1039          * the page is not currently in memory, then the kzero
1040          * operation will cause it to be brought it.  We use kzero
1041          * instead of bzero so that if the page cannot be read in
1042          * for any reason, the system will not panic.  We need
1043          * to zero out a minimum of the fs given zbytes, but we
1044          * might also have to do more to get the entire last page.
1045          */
1046
1047         if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1048                 panic("pvn_vptrunc zbytes");
1049         addr = segmap_getmapflt(segkmap, vp, vplen,
1050             MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1051         (void) kzero(addr + (vplen & MAXBOFFSET),
1052             MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1053         (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1054 }
1055
1056 /*
1057  * Handles common work of the fop_getpage routines by iterating page by page
1058  * calling the getpage helper for each.
1059  */
1060 int
1061 pvn_getpages(
1062         int (*getpage)(vnode_t *, uoff_t, size_t, uint_t *, page_t *[],
1063                 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1064         struct vnode *vp,
1065         uoff_t off,
1066         size_t len,
1067         uint_t *protp,
1068         page_t *pl[],
1069         size_t plsz,
1070         struct seg *seg,
1071         caddr_t addr,
1072         enum seg_rw rw,
1073         struct cred *cred)
1074 {
1075         page_t **ppp;
1076         uoff_t o, eoff;
1077         size_t sz, xlen;
1078         int err;
1079
1080         /* ensure that we have enough space */
1081         ASSERT(pl == NULL || plsz >= len);
1082
1083         /*
1084          * Loop one page at a time and let getapage function fill
1085          * in the next page in array.  We only allow one page to be
1086          * returned at a time (except for the last page) so that we
1087          * don't have any problems with duplicates and other such
1088          * painful problems.  This is a very simple minded algorithm,
1089          * but it does the job correctly.  We hope that the cost of a
1090          * getapage call for a resident page that we might have been
1091          * able to get from an earlier call doesn't cost too much.
1092          */
1093         ppp = pl;
1094         sz = (pl != NULL) ? PAGESIZE : 0;
1095         eoff = off + len;
1096         xlen = len;
1097         for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1098             xlen -= PAGESIZE) {
1099                 if (o + PAGESIZE >= eoff && pl != NULL) {
1100                         /*
1101                          * Last time through - allow the all of
1102                          * what's left of the pl[] array to be used.
1103                          */
1104                         sz = plsz - (o - off);
1105                 }
1106                 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1107                     rw, cred);
1108                 if (err) {
1109                         /*
1110                          * Release any pages we already got.
1111                          */
1112                         if (o > off && pl != NULL) {
1113                                 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1114                                         (void) page_release(*ppp, 1);
1115                         }
1116                         break;
1117                 }
1118                 if (pl != NULL)
1119                         ppp++;
1120         }
1121         return (err);
1122 }
1123
1124 /*
1125  * Initialize the page list array.
1126  */
1127 /*ARGSUSED*/
1128 void
1129 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1130     uoff_t off, size_t io_len, enum seg_rw rw)
1131 {
1132         ssize_t sz;
1133         page_t *ppcur, **ppp;
1134
1135         /*
1136          * Set up to load plsz worth
1137          * starting at the needed page.
1138          */
1139         while (pp != NULL && pp->p_offset != off) {
1140                 /*
1141                  * Remove page from the i/o list,
1142                  * release the i/o and the page lock.
1143                  */
1144                 ppcur = pp;
1145                 page_sub(&pp, ppcur);
1146                 page_io_unlock(ppcur);
1147                 (void) page_release(ppcur, 1);
1148         }
1149
1150         if (pp == NULL) {
1151                 pl[0] = NULL;
1152                 return;
1153         }
1154
1155         sz = plsz;
1156
1157         /*
1158          * Initialize the page list array.
1159          */
1160         ppp = pl;
1161         do {
1162                 ppcur = pp;
1163                 *ppp++ = ppcur;
1164                 page_sub(&pp, ppcur);
1165                 page_io_unlock(ppcur);
1166                 if (rw != S_CREATE)
1167                         page_downgrade(ppcur);
1168                 sz -= PAGESIZE;
1169         } while (sz > 0 && pp != NULL);
1170         *ppp = NULL;            /* terminate list */
1171
1172         /*
1173          * Now free the remaining pages that weren't
1174          * loaded in the page list.
1175          */
1176         while (pp != NULL) {
1177                 ppcur = pp;
1178                 page_sub(&pp, ppcur);
1179                 page_io_unlock(ppcur);
1180                 (void) page_release(ppcur, 1);
1181         }
1182 }