dmake: do not set MAKEFLAGS=k
[unleashed/tickless.git] / kernel / vm / vm_pvn.c
blob45f2f521f70d9e68019b1c9a37f0168f01f12b44
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
40 * VM - paged vnode.
42 * This file supplies vm support for the vnode operations that deal with pages.
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/sysmacros.h>
48 #include <sys/systm.h>
49 #include <sys/time.h>
50 #include <sys/buf.h>
51 #include <sys/vnode.h>
52 #include <sys/uio.h>
53 #include <sys/vmsystm.h>
54 #include <sys/mman.h>
55 #include <sys/vfs.h>
56 #include <sys/cred.h>
57 #include <sys/user.h>
58 #include <sys/kmem.h>
59 #include <sys/cmn_err.h>
60 #include <sys/debug.h>
61 #include <sys/cpuvar.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/seg.h>
68 #include <vm/rm.h>
69 #include <vm/pvn.h>
70 #include <vm/page.h>
71 #include <vm/seg_map.h>
72 #include <vm/seg_kmem.h>
73 #include <sys/fs/swapnode.h>
/*
 * Tunables.  When pvn_nofodklust is non-zero pvn_read_kluster() returns
 * only the single requested page; when pvn_write_noklust is non-zero
 * pvn_write_kluster() returns only the caller's page.
 */
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

/* kmem cache handing out marker page_t's; set up by pvn_init(). */
static struct kmem_cache *marker_cache = NULL;
81 * Find the largest contiguous block which contains `addr' for file offset
82 * `offset' in it while living within the file system block sizes (`vp_off'
83 * and `vp_len') and the address space limits for which no pages currently
84 * exist and which map to consecutive file offsets.
86 page_t *
87 pvn_read_kluster(
88 struct vnode *vp,
89 uoff_t off,
90 struct seg *seg,
91 caddr_t addr,
92 uoff_t *offp, /* return values */
93 size_t *lenp, /* return values */
94 uoff_t vp_off,
95 size_t vp_len,
96 int isra)
98 ssize_t deltaf, deltab;
99 page_t *pp;
100 page_t *plist = NULL;
101 spgcnt_t pagesavail;
102 uoff_t vp_end;
104 ASSERT(off >= vp_off && off < vp_off + vp_len);
107 * We only want to do klustering/read ahead if there
108 * is more than minfree pages currently available.
110 pagesavail = freemem - minfree;
112 if (pagesavail <= 0)
113 if (isra)
114 return (NULL); /* ra case - give up */
115 else
116 pagesavail = 1; /* must return a page */
118 /* We calculate in pages instead of bytes due to 32-bit overflows */
119 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
121 * Don't have enough free memory for the
122 * max request, try sizing down vp request.
124 deltab = (ssize_t)(off - vp_off);
125 vp_len -= deltab;
126 vp_off += deltab;
127 if (pagesavail < btopr(vp_len)) {
129 * Still not enough memory, just settle for
130 * pagesavail which is at least 1.
132 vp_len = ptob(pagesavail);
136 vp_end = vp_off + vp_len;
137 ASSERT(off >= vp_off && off < vp_end);
139 if (isra && segop_kluster(seg, addr, 0))
140 return (NULL); /* segment driver says no */
142 if ((plist = page_create_va(&vp->v_object, off,
143 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
144 return (NULL);
146 if (vp_len <= PAGESIZE || pvn_nofodklust) {
147 *offp = off;
148 *lenp = MIN(vp_len, PAGESIZE);
149 } else {
151 * Scan back from front by incrementing "deltab" and
152 * comparing "off" with "vp_off + deltab" to avoid
153 * "signed" versus "unsigned" conversion problems.
155 for (deltab = PAGESIZE; off >= vp_off + deltab;
156 deltab += PAGESIZE) {
158 * Call back to the segment driver to verify that
159 * the klustering/read ahead operation makes sense.
161 if (segop_kluster(seg, addr, -deltab))
162 break; /* page not eligible */
163 if ((pp = page_create_va(&vp->v_object, off - deltab,
164 PAGESIZE, PG_EXCL, seg, addr - deltab))
165 == NULL)
166 break; /* already have the page */
168 * Add page to front of page list.
170 page_add(&plist, pp);
172 deltab -= PAGESIZE;
174 /* scan forward from front */
175 for (deltaf = PAGESIZE; off + deltaf < vp_end;
176 deltaf += PAGESIZE) {
178 * Call back to the segment driver to verify that
179 * the klustering/read ahead operation makes sense.
181 if (segop_kluster(seg, addr, deltaf))
182 break; /* page not file extension */
183 if ((pp = page_create_va(&vp->v_object, off + deltaf,
184 PAGESIZE, PG_EXCL, seg, addr + deltaf))
185 == NULL)
186 break; /* already have page */
189 * Add page to end of page list.
191 page_add(&plist, pp);
192 plist = plist->p_next;
194 *offp = off = off - deltab;
195 *lenp = deltab + deltaf;
196 ASSERT(off >= vp_off);
199 * If we ended up getting more than was actually
200 * requested, retract the returned length to only
201 * reflect what was requested. This might happen
202 * if we were allowed to kluster pages across a
203 * span of (say) 5 frags, and frag size is less
204 * than PAGESIZE. We need a whole number of
205 * pages to contain those frags, but the returned
206 * size should only allow the returned range to
207 * extend as far as the end of the frags.
209 if ((vp_off + vp_len) < (off + *lenp)) {
210 ASSERT(vp_end > off);
211 *lenp = vp_end - off;
214 return (plist);
218 * Handle pages for this vnode on either side of the page "pp"
219 * which has been locked by the caller. This routine will also
220 * do klustering in the range [vp_off, vp_off + vp_len] up
221 * until a page which is not found. The offset and length
222 * of pages included is returned in "*offp" and "*lenp".
224 * Returns a list of dirty locked pages all ready to be
225 * written back.
227 page_t *
228 pvn_write_kluster(
229 struct vnode *vp,
230 page_t *pp,
231 uoff_t *offp, /* return values */
232 size_t *lenp, /* return values */
233 uoff_t vp_off,
234 size_t vp_len,
235 int flags)
237 uoff_t off;
238 page_t *dirty;
239 size_t deltab, deltaf;
240 se_t se;
241 uoff_t vp_end;
243 off = pp->p_offset;
246 * Kustering should not be done if we are invalidating
247 * pages since we could destroy pages that belong to
248 * some other process if this is a swap vnode.
250 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
251 *offp = off;
252 *lenp = PAGESIZE;
253 return (pp);
256 if (flags & (B_FREE | B_INVAL))
257 se = SE_EXCL;
258 else
259 se = SE_SHARED;
261 dirty = pp;
263 * Scan backwards looking for pages to kluster by incrementing
264 * "deltab" and comparing "off" with "vp_off + deltab" to
265 * avoid "signed" versus "unsigned" conversion problems.
267 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
268 pp = page_lookup_nowait(&vp->v_object, off - deltab, se);
269 if (pp == NULL)
270 break; /* page not found */
271 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
272 break;
273 page_add(&dirty, pp);
275 deltab -= PAGESIZE;
277 vp_end = vp_off + vp_len;
278 /* now scan forwards looking for pages to kluster */
279 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
280 pp = page_lookup_nowait(&vp->v_object, off + deltaf, se);
281 if (pp == NULL)
282 break; /* page not found */
283 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
284 break;
285 page_add(&dirty, pp);
286 dirty = dirty->p_next;
289 *offp = off - deltab;
290 *lenp = deltab + deltaf;
291 return (dirty);
295 * Generic entry point used to release the "shared/exclusive" lock
296 * and the "p_iolock" on pages after i/o is complete.
298 void
299 pvn_io_done(page_t *plist)
301 page_t *pp;
303 while (plist != NULL) {
304 pp = plist;
305 page_sub(&plist, pp);
306 page_io_unlock(pp);
307 page_unlock(pp);
312 * Entry point to be used by file system getpage subr's and
313 * other such routines which either want to unlock pages (B_ASYNC
314 * request) or destroy a list of pages if an error occurred.
316 void
317 pvn_read_done(page_t *plist, int flags)
319 page_t *pp;
321 while (plist != NULL) {
322 pp = plist;
323 page_sub(&plist, pp);
324 page_io_unlock(pp);
325 if (flags & B_ERROR) {
326 VN_DISPOSE(pp, B_INVAL, 0, kcred);
327 } else {
328 (void) page_release(pp, 0);
334 * Automagic pageout.
335 * When memory gets tight, start freeing pages popping out of the
336 * write queue.
338 int write_free = 1;
339 pgcnt_t pages_before_pager = 200; /* LMXXX */
342 * Routine to be called when page-out's complete.
343 * The caller, typically fop_putpage, has to explicity call this routine
344 * after waiting for i/o to complete (biowait) to free the list of
345 * pages associated with the buffer. These pages must be locked
346 * before i/o is initiated.
348 * If a write error occurs, the pages are marked as modified
349 * so the write will be re-tried later.
352 void
353 pvn_write_done(page_t *plist, int flags)
355 int dfree = 0;
356 int pgrec = 0;
357 int pgout = 0;
358 int pgpgout = 0;
359 int anonpgout = 0;
360 int anonfree = 0;
361 int fspgout = 0;
362 int fsfree = 0;
363 int execpgout = 0;
364 int execfree = 0;
365 page_t *pp;
366 struct cpu *cpup;
367 struct vnode *vp = NULL; /* for probe */
368 uint_t ppattr;
370 ASSERT((flags & B_READ) == 0);
373 * If we are about to start paging anyway, start freeing pages.
375 if (write_free && freemem < lotsfree + pages_before_pager &&
376 (flags & B_ERROR) == 0) {
377 flags |= B_FREE;
381 * Handle each page involved in the i/o operation.
383 while (plist != NULL) {
384 pp = plist;
385 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
386 page_sub(&plist, pp);
388 /* Kernel probe support */
389 if (vp == NULL)
390 vp = pp->p_vnode;
392 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
394 * Move page to the top of the v_page list.
395 * Skip pages modified during IO.
397 vmobject_lock(&vp->v_object);
398 if (!hat_ismod(pp))
399 vmobject_move_page_tail(&vp->v_object, pp);
400 vmobject_unlock(&vp->v_object);
403 if (flags & B_ERROR) {
405 * Write operation failed. We don't want
406 * to destroy (or free) the page unless B_FORCE
407 * is set. We set the mod bit again and release
408 * all locks on the page so that it will get written
409 * back again later when things are hopefully
410 * better again.
411 * If B_INVAL and B_FORCE is set we really have
412 * to destroy the page.
414 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
415 page_io_unlock(pp);
416 VN_DISPOSE(pp, B_INVAL, 0, kcred);
417 } else {
418 hat_setmod_only(pp);
419 page_io_unlock(pp);
420 page_unlock(pp);
422 } else if (flags & B_INVAL) {
424 * XXX - Failed writes with B_INVAL set are
425 * not handled appropriately.
427 page_io_unlock(pp);
428 VN_DISPOSE(pp, B_INVAL, 0, kcred);
429 } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
431 * Update statistics for pages being paged out
433 if (pp->p_vnode) {
434 if (IS_SWAPFSVP(pp->p_vnode)) {
435 anonpgout++;
436 } else {
437 if (pp->p_vnode->v_flag & VVMEXEC) {
438 execpgout++;
439 } else {
440 fspgout++;
444 page_io_unlock(pp);
445 pgout = 1;
446 pgpgout++;
449 * The page_struct_lock need not be acquired to
450 * examine "p_lckcnt" and "p_cowcnt" since we'll
451 * have an "exclusive" lock if the upgrade succeeds.
453 if (page_tryupgrade(pp) &&
454 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
456 * Check if someone has reclaimed the
457 * page. If ref and mod are not set, no
458 * one is using it so we can free it.
459 * The rest of the system is careful
460 * to use the NOSYNC flag to unload
461 * translations set up for i/o w/o
462 * affecting ref and mod bits.
464 * Obtain a copy of the real hardware
465 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
466 * to avoid having to flush the cache.
468 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
469 HAT_SYNC_STOPON_MOD);
470 ck_refmod:
471 if (!(ppattr & (P_REF | P_MOD))) {
472 if (hat_page_is_mapped(pp)) {
474 * Doesn't look like the page
475 * was modified so now we
476 * really have to unload the
477 * translations. Meanwhile
478 * another CPU could've
479 * modified it so we have to
480 * check again. We don't loop
481 * forever here because now
482 * the translations are gone
483 * and no one can get a new one
484 * since we have the "exclusive"
485 * lock on the page.
487 (void) hat_pageunload(pp,
488 HAT_FORCE_PGUNLOAD);
489 ppattr = hat_page_getattr(pp,
490 P_REF | P_MOD);
491 goto ck_refmod;
494 * Update statistics for pages being
495 * freed
497 if (pp->p_vnode) {
498 if (IS_SWAPFSVP(pp->p_vnode)) {
499 anonfree++;
500 } else {
501 if (pp->p_vnode->v_flag
502 & VVMEXEC) {
503 execfree++;
504 } else {
505 fsfree++;
510 VN_DISPOSE(pp, B_FREE,
511 (flags & B_DONTNEED), kcred);
512 dfree++;
513 } else {
514 page_unlock(pp);
515 pgrec++;
517 } else {
519 * Page is either `locked' in memory
520 * or was reclaimed and now has a
521 * "shared" lock, so release it.
523 page_unlock(pp);
525 } else {
527 * Neither B_FREE nor B_INVAL nor B_ERROR.
528 * Just release locks.
530 page_io_unlock(pp);
531 page_unlock(pp);
535 CPU_STATS_ENTER_K();
536 cpup = CPU; /* get cpup now that CPU cannot change */
537 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
538 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
539 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
540 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
541 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
542 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
543 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
544 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
545 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
546 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
547 CPU_STATS_EXIT_K();
551 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
552 * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster
553 * operation and is only to be considered if it doesn't involve any
554 * waiting here. B_TRUNC indicates that the file is being truncated
555 * and so no i/o needs to be done. B_FORCE indicates that the page
556 * must be destroyed so don't try wrting it out.
558 * The caller must ensure that the page is locked. Returns 1, if
559 * the page should be written back (the "iolock" is held in this
560 * case), or 0 if the page has been dealt with or has been
561 * unlocked.
564 pvn_getdirty(page_t *pp, int flags)
566 ASSERT((flags & (B_INVAL | B_FREE)) ?
567 PAGE_EXCL(pp) : PAGE_SHARED(pp));
568 ASSERT(PP_ISFREE(pp) == 0);
571 * If trying to invalidate or free a logically `locked' page,
572 * forget it. Don't need page_struct_lock to check p_lckcnt and
573 * p_cowcnt as the page is exclusively locked.
575 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
576 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
577 page_unlock(pp);
578 return (0);
582 * Now acquire the i/o lock so we can add it to the dirty
583 * list (if necessary). We avoid blocking on the i/o lock
584 * in the following cases:
586 * If B_DELWRI is set, which implies that this request is
587 * due to a klustering operartion.
589 * If this is an async (B_ASYNC) operation and we are not doing
590 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
591 * that the the page is written out].
593 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
594 if (!page_io_trylock(pp)) {
595 page_unlock(pp);
596 return (0);
598 } else {
599 page_io_lock(pp);
603 * If we want to free or invalidate the page then
604 * we need to unload it so that anyone who wants
605 * it will have to take a minor fault to get it.
606 * Otherwise, we're just writing the page back so we
607 * need to sync up the hardwre and software mod bit to
608 * detect any future modifications. We clear the
609 * software mod bit when we put the page on the dirty
610 * list.
612 if (flags & (B_INVAL | B_FREE)) {
613 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
614 } else {
615 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
618 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
620 * Don't need to add it to the
621 * list after all.
623 page_io_unlock(pp);
624 if (flags & B_INVAL) {
625 VN_DISPOSE(pp, B_INVAL, 0, kcred);
626 } else if (flags & B_FREE) {
627 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
628 } else {
630 * This is advisory path for the callers
631 * of fop_putpage() who prefer freeing the
632 * page _only_ if no one else is accessing it.
633 * E.g. segmap_release()
635 * The above hat_ismod() check is useless because:
636 * (1) we may not be holding SE_EXCL lock;
637 * (2) we've not unloaded _all_ translations
639 * Let page_release() do the heavy-lifting.
641 (void) page_release(pp, 1);
643 return (0);
647 * Page is dirty, get it ready for the write back
648 * and add page to the dirty list.
650 hat_clrrefmod(pp);
653 * If we're going to free the page when we're done
654 * then we can let others try to use it starting now.
655 * We'll detect the fact that they used it when the
656 * i/o is done and avoid freeing the page.
658 if (flags & B_FREE)
659 page_downgrade(pp);
661 return (1);
665 /*ARGSUSED*/
666 static int
667 marker_constructor(void *buf, void *cdrarg, int kmflags)
669 page_t *mark = buf;
670 bzero(mark, sizeof (page_t));
671 PP_SETPVN_TAG(mark);
672 return (0);
675 void
676 pvn_init()
678 marker_cache = kmem_cache_create("marker_cache",
679 sizeof (page_t), 0, marker_constructor,
680 NULL, NULL, NULL, NULL, 0);
683 static inline void
684 move_marker(struct vnode *vnode, struct page *ref, struct page *mark)
686 list_remove(&vnode->v_object.list, mark);
687 list_insert_before(&vnode->v_object.list, ref, mark);
691 * Process a vnode's page list for all pages whose offset is >= off.
692 * Pages are to either be free'd, invalidated, or written back to disk.
694 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
695 * is specified, otherwise they are "shared" locked.
697 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
699 * Special marker page_t's are inserted in the list in order
700 * to keep track of where we are in the list when locks are dropped.
702 * Note the list is circular and insertions can happen only at the
703 * head and tail of the list. The algorithm ensures visiting all pages
704 * on the list in the following way:
706 * Drop two marker pages at the end of the list.
708 * Move one marker page backwards towards the start of the list until
709 * it is at the list head, processing the pages passed along the way.
711 * Due to race conditions when the vnode page mutex is dropped,
712 * additional pages can be added to either end of the list, so we'll
713 * continue to move the marker and process pages until it is up against
714 * the end marker.
716 * There is one special exit condition. If we are processing a VMODSORT
717 * vnode and only writing back modified pages, we can stop as soon as
718 * we run into an unmodified page. This makes fsync(3) operations fast.
721 pvn_vplist_dirty(
722 vnode_t *vp,
723 uoff_t off,
724 int (*putapage)(vnode_t *, page_t *, uoff_t *,
725 size_t *, int, cred_t *),
726 int flags,
727 cred_t *cred)
729 page_t *pp;
730 page_t *mark; /* marker page that moves toward head */
731 page_t *end; /* marker page at end of list */
732 int err = 0;
733 int error;
734 se_t se;
736 ASSERT(vp->v_type != VCHR);
738 if (!vn_has_cached_data(vp))
739 return (0);
743 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
745 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
746 * from getting blocked while flushing pages to a dead NFS server.
748 mutex_enter(&vp->v_lock);
749 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
750 mutex_exit(&vp->v_lock);
751 return (EAGAIN);
754 while (vp->v_flag & VVMLOCK)
755 cv_wait(&vp->v_cv, &vp->v_lock);
757 if (!vn_has_cached_data(vp)) {
758 mutex_exit(&vp->v_lock);
759 return (0);
762 vp->v_flag |= VVMLOCK;
763 mutex_exit(&vp->v_lock);
767 * Set up the marker pages used to walk the list
769 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
770 end->p_object = &vp->v_object;
771 end->p_vnode = vp;
772 end->p_offset = (uoff_t)-2;
773 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
774 mark->p_object = &vp->v_object;
775 mark->p_vnode = vp;
776 mark->p_offset = (uoff_t)-1;
779 * Grab the lock protecting the vnode's page list
780 * note that this lock is dropped at times in the loop.
782 vmobject_lock(&vp->v_object);
783 if (!vn_has_cached_data(vp))
784 goto leave;
787 * insert the markers and loop through the list of pages
789 vmobject_add_page_tail(&vp->v_object, mark);
790 vmobject_add_page_tail(&vp->v_object, end);
792 for (;;) {
795 * If only doing an async write back, then we can
796 * stop as soon as we get to start of the list.
798 if (flags == B_ASYNC && vmobject_get_head(&vp->v_object) == mark)
799 break;
801 pp = vmobject_get_prev_loop(&vp->v_object, mark);
804 * otherwise stop when we've gone through all the pages
806 if (pp == end)
807 break;
809 VERIFY(pp->p_object == &vp->v_object);
810 ASSERT(pp->p_vnode == vp);
813 * If just flushing dirty pages to disk and this vnode
814 * is using a sorted list of pages, we can stop processing
815 * as soon as we find an unmodified page. Since all the
816 * modified pages are visited first.
818 if (IS_VMODSORT(vp) &&
819 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
820 if (!hat_ismod(pp) && !page_io_locked(pp)) {
821 #ifdef DEBUG
823 * For debug kernels examine what should be
824 * all the remaining clean pages, asserting
825 * that they are not modified.
827 page_t *chk = pp;
828 int attr;
830 move_marker(vp, pp, mark);
832 do {
833 chk = vmobject_get_prev_loop(&vp->v_object,
834 chk);
835 ASSERT(chk != end);
836 if (chk == mark)
837 continue;
838 attr = hat_page_getattr(chk, P_MOD |
839 P_REF);
840 if ((attr & P_MOD) == 0)
841 continue;
842 panic("v_object list not all clean: "
843 "page_t*=%p vnode=%p off=%lx "
844 "attr=0x%x last clean page_t*=%p\n",
845 chk, chk->p_vnode,
846 (long)chk->p_offset, attr, pp);
847 } while (chk != vmobject_get_head(&vp->v_object));
848 #endif
849 break;
850 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
852 * Couldn't get io lock, wait until IO is done.
853 * Block only for sync IO since we don't want
854 * to block async IO.
856 vmobject_unlock(&vp->v_object);
857 page_io_wait(pp);
858 vmobject_lock(&vp->v_object);
859 continue;
864 * Skip this page if the offset is out of the desired range.
865 * Just move the marker and continue.
867 if (pp->p_offset < off) {
868 move_marker(vp, pp, mark);
869 continue;
873 * If we are supposed to invalidate or free this
874 * page, then we need an exclusive lock.
876 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
879 * We must acquire the page lock for all synchronous
880 * operations (invalidate, free and write).
882 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
884 * If the page_lock() drops the mutex
885 * we must retry the loop.
887 if (!page_lock(pp, se, &vp->v_object, P_NO_RECLAIM))
888 continue;
891 * It's ok to move the marker page now.
893 move_marker(vp, pp, mark);
894 } else {
897 * update the marker page for all remaining cases
899 move_marker(vp, pp, mark);
902 * For write backs, If we can't lock the page, it's
903 * invalid or in the process of being destroyed. Skip
904 * it, assuming someone else is writing it.
906 if (!page_trylock(pp, se))
907 continue;
910 VERIFY(pp->p_object == &vp->v_object);
911 ASSERT(pp->p_vnode == vp);
914 * Successfully locked the page, now figure out what to
915 * do with it. Free pages are easily dealt with, invalidate
916 * if desired or just go on to the next page.
918 if (PP_ISFREE(pp)) {
919 if ((flags & B_INVAL) == 0) {
920 page_unlock(pp);
921 continue;
925 * Invalidate (destroy) the page.
927 vmobject_unlock(&vp->v_object);
928 page_destroy_free(pp);
929 vmobject_lock(&vp->v_object);
930 continue;
934 * pvn_getdirty() figures out what do do with a dirty page.
935 * If the page is dirty, the putapage() routine will write it
936 * and will kluster any other adjacent dirty pages it can.
938 * pvn_getdirty() and `(*putapage)' unlock the page.
940 vmobject_unlock(&vp->v_object);
941 if (pvn_getdirty(pp, flags)) {
942 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
943 if (!err)
944 err = error;
946 vmobject_lock(&vp->v_object);
948 vmobject_remove_page(&vp->v_object, mark);
949 vmobject_remove_page(&vp->v_object, end);
951 leave:
953 * Release v_object mutex, also VVMLOCK and wakeup blocked
954 * threads
956 vmobject_unlock(&vp->v_object);
957 kmem_cache_free(marker_cache, mark);
958 kmem_cache_free(marker_cache, end);
959 mutex_enter(&vp->v_lock);
960 vp->v_flag &= ~VVMLOCK;
961 cv_broadcast(&vp->v_cv);
962 mutex_exit(&vp->v_lock);
963 return (err);
967 * Walk the vp->v_object list, for every page call the callback function
968 * pointed by *page_check. If page_check returns non-zero, then mark the
969 * page as modified and if VMODSORT is set, move it to the end of
970 * v_object list. Moving makes sense only if we have at least two pages.
972 void
973 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
975 page_t *pp, *next, *end;
976 int shuffle;
978 vmobject_lock(&vp->v_object);
980 if (!vn_has_cached_data(vp)) {
981 vmobject_unlock(&vp->v_object);
982 return;
985 end = vmobject_get_tail(&vp->v_object);
986 pp = vmobject_get_head(&vp->v_object);
987 shuffle = IS_VMODSORT(vp) && (pp != end);
989 for (;;) {
990 next = vmobject_get_next_loop(&vp->v_object, pp);
991 if (!PP_ISPVN_TAG(pp) && page_check(pp)) {
993 * hat_setmod_only() in contrast to hat_setmod() does
994 * not shuffle the pages and does not grab the vnode
995 * page mutex. Exactly what we need.
997 hat_setmod_only(pp);
998 if (shuffle)
999 vmobject_move_page_tail(&vp->v_object, pp);
1001 /* Stop if we have just processed the last page. */
1002 if (pp == end)
1003 break;
1004 pp = next;
1007 vmobject_unlock(&vp->v_object);
1011 * Zero out zbytes worth of data. Caller should be aware that this
1012 * routine may enter back into the fs layer (xxx_getpage). Locks
1013 * that the xxx_getpage routine may need should not be held while
1014 * calling this.
1016 void
1017 pvn_vpzero(struct vnode *vp, uoff_t vplen, size_t zbytes)
1019 caddr_t addr;
1021 ASSERT(vp->v_type != VCHR);
1023 if (!vn_has_cached_data(vp))
1024 return;
1027 * zbytes may be zero but there still may be some portion of
1028 * a page which needs clearing (since zbytes is a function
1029 * of filesystem block size, not pagesize.)
1031 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1032 return;
1035 * We get the last page and handle the partial
1036 * zeroing via kernel mappings. This will make the page
1037 * dirty so that we know that when this page is written
1038 * back, the zeroed information will go out with it. If
1039 * the page is not currently in memory, then the kzero
1040 * operation will cause it to be brought it. We use kzero
1041 * instead of bzero so that if the page cannot be read in
1042 * for any reason, the system will not panic. We need
1043 * to zero out a minimum of the fs given zbytes, but we
1044 * might also have to do more to get the entire last page.
1047 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1048 panic("pvn_vptrunc zbytes");
1049 addr = segmap_getmapflt(segkmap, vp, vplen,
1050 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1051 (void) kzero(addr + (vplen & MAXBOFFSET),
1052 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1053 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1057 * Handles common work of the fop_getpage routines by iterating page by page
1058 * calling the getpage helper for each.
1061 pvn_getpages(
1062 int (*getpage)(vnode_t *, uoff_t, size_t, uint_t *, page_t *[],
1063 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1064 struct vnode *vp,
1065 uoff_t off,
1066 size_t len,
1067 uint_t *protp,
1068 page_t *pl[],
1069 size_t plsz,
1070 struct seg *seg,
1071 caddr_t addr,
1072 enum seg_rw rw,
1073 struct cred *cred)
1075 page_t **ppp;
1076 uoff_t o, eoff;
1077 size_t sz, xlen;
1078 int err;
1080 /* ensure that we have enough space */
1081 ASSERT(pl == NULL || plsz >= len);
1084 * Loop one page at a time and let getapage function fill
1085 * in the next page in array. We only allow one page to be
1086 * returned at a time (except for the last page) so that we
1087 * don't have any problems with duplicates and other such
1088 * painful problems. This is a very simple minded algorithm,
1089 * but it does the job correctly. We hope that the cost of a
1090 * getapage call for a resident page that we might have been
1091 * able to get from an earlier call doesn't cost too much.
1093 ppp = pl;
1094 sz = (pl != NULL) ? PAGESIZE : 0;
1095 eoff = off + len;
1096 xlen = len;
1097 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1098 xlen -= PAGESIZE) {
1099 if (o + PAGESIZE >= eoff && pl != NULL) {
1101 * Last time through - allow the all of
1102 * what's left of the pl[] array to be used.
1104 sz = plsz - (o - off);
1106 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1107 rw, cred);
1108 if (err) {
1110 * Release any pages we already got.
1112 if (o > off && pl != NULL) {
1113 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1114 (void) page_release(*ppp, 1);
1116 break;
1118 if (pl != NULL)
1119 ppp++;
1121 return (err);
1125 * Initialize the page list array.
1127 /*ARGSUSED*/
1128 void
1129 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1130 uoff_t off, size_t io_len, enum seg_rw rw)
1132 ssize_t sz;
1133 page_t *ppcur, **ppp;
1136 * Set up to load plsz worth
1137 * starting at the needed page.
1139 while (pp != NULL && pp->p_offset != off) {
1141 * Remove page from the i/o list,
1142 * release the i/o and the page lock.
1144 ppcur = pp;
1145 page_sub(&pp, ppcur);
1146 page_io_unlock(ppcur);
1147 (void) page_release(ppcur, 1);
1150 if (pp == NULL) {
1151 pl[0] = NULL;
1152 return;
1155 sz = plsz;
1158 * Initialize the page list array.
1160 ppp = pl;
1161 do {
1162 ppcur = pp;
1163 *ppp++ = ppcur;
1164 page_sub(&pp, ppcur);
1165 page_io_unlock(ppcur);
1166 if (rw != S_CREATE)
1167 page_downgrade(ppcur);
1168 sz -= PAGESIZE;
1169 } while (sz > 0 && pp != NULL);
1170 *ppp = NULL; /* terminate list */
1173 * Now free the remaining pages that weren't
1174 * loaded in the page list.
1176 while (pp != NULL) {
1177 ppcur = pp;
1178 page_sub(&pp, ppcur);
1179 page_io_unlock(ppcur);
1180 (void) page_release(ppcur, 1);