/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cred.h>
#include <sys/errno.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/cmn_err.h>
#include <sys/swap.h>
#include <sys/mman.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/vm.h>

#include <sys/fs/swapnode.h>

#include <vm/seg.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/fs_subr.h>

#include <vm/seg_kp.h>

/*
 * Define the routines within this file.
 */
static int swap_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, struct page **plarr, size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, struct cred *cr, caller_context_t *ct);
static int swap_putpage(struct vnode *vp, offset_t off, size_t len,
    int flags, struct cred *cr, caller_context_t *ct);
static void swap_inactive(struct vnode *vp, struct cred *cr,
    caller_context_t *ct);
static void swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn,
    cred_t *cr, caller_context_t *ct);

static int swap_getapage(struct vnode *vp, uoff_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz, struct seg *seg,
    caddr_t addr, enum seg_rw rw, struct cred *cr);

int swap_getconpage(struct vnode *vp, uoff_t off, size_t len,
    uint_t *protp, page_t **plarr, size_t plsz, page_t *conpp,
    uint_t *pszc, spgcnt_t *nreloc, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

static int swap_putapage(struct vnode *vp, page_t *pp, uoff_t *off,
    size_t *lenp, int flags, struct cred *cr);

const struct vnodeops swap_vnodeops = {
	.vnop_name = "swapfs",
	.vop_inactive = swap_inactive,
	.vop_getpage = swap_getpage,
	.vop_putpage = swap_putpage,
	.vop_dispose = swap_dispose,
	.vop_setfl = fs_nosys,
	.vop_poll = (void *) fs_nosys,
	.vop_pathconf = fs_nosys,
	.vop_getsecattr = fs_nosys,
	.vop_shrlock = fs_nosys,
};
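
/*
 * swapfs exists only to back anonymous pages, so only the paging-related
 * operations above carry real implementations; the entries explicitly
 * wired to fs_nosys fail with ENOSYS rather than doing anything useful.
 */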

/*ARGSUSED1*/
static void
swap_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_inactive: vp %x\n", vp, 0, 0, 0, 0);
}
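
/*
 * Note: inactive is deliberately a no-op beyond the trace above; a swapfs
 * vnode has no private state that needs tearing down here.
 */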

/*
 * Return all the pages from [off..off+len] in the given file.
 */
/*ARGSUSED*/
static int
swap_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ct)
{
	SWAPFS_PRINT(SWAP_VOPS, "swap_getpage: vp %p, off %llx, len %lx\n",
	    (void *)vp, off, len, 0, 0);

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETPAGE,
	    "swapfs getpage:vp %p off %llx len %ld",
	    (void *)vp, off, len);

	return (pvn_getpages(swap_getapage, vp, (uoff_t)off, len, protp,
	    pl, plsz, seg, addr, rw, cr));
}
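
/*
 * A rough sketch of what pvn_getpages() does with the request above
 * (illustrative only; the real code also assembles the returned page
 * list and handles short lists and errors):
 *
 *	for (o = off; o < off + len; o += PAGESIZE)
 *		err = swap_getapage(vp, o, PAGESIZE, protp, pl, plsz,
 *		    seg, addr + (o - off), rw, cr);
 */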

/*
 * Called from pvn_getpages to get a particular page.
 */
/*ARGSUSED*/
static int
swap_getapage(struct vnode *vp, uoff_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr)
{
	struct page *pp, *rpp;
	int flags;
	int err = 0;
	struct vnode *pvp = NULL;
	uoff_t poff;
	uint_t flag_noreloc;
	se_t lock;
	extern int kcage_on;
	int upgrade = 0;

	SWAPFS_PRINT(SWAP_VOPS, "swap_getapage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * Until there is a call-back mechanism to cause SEGKP
	 * pages to be unlocked, make them non-relocatable.
	 */
	if (SEG_IS_SEGKP(seg))
		flag_noreloc = PG_NORELOC;
	else
		flag_noreloc = 0;

	if (protp != NULL)
		*protp = PROT_ALL;

	lock = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

again:
	if (pp = page_lookup(&vp->v_object, off, lock)) {
		/*
		 * In very rare instances, a segkp page may have been
		 * relocated outside of the kernel by the kernel cage
		 * due to the window between page_unlock() and
		 * fop_putpage() in segkp_unlock().  Due to the
		 * rareness of these occurrences, the solution is to
		 * relocate the page to a P_NORELOC page.
		 */
		if (flag_noreloc != 0) {
			if (!PP_ISNORELOC(pp) && kcage_on) {
				if (lock != SE_EXCL) {
					upgrade = 1;
					if (!page_tryupgrade(pp)) {
						page_unlock(pp);
						lock = SE_EXCL;
						goto again;
					}
				}

				if (page_relocate_cage(&pp, &rpp) != 0)
					panic("swap_getapage: "
					    "page_relocate_cage failed");

				pp = rpp;
			}
		}

		if (pl) {
			if (upgrade)
				page_downgrade(pp);

			pl[0] = pp;
			pl[1] = NULL;
		} else {
			page_unlock(pp);
		}
	} else {
		pp = page_create_va(&vp->v_object, off, PAGESIZE,
		    PG_WAIT | PG_EXCL | flag_noreloc,
		    seg, addr);
		/*
		 * Someone raced in and created the page after we did the
		 * lookup but before we did the create, so go back and
		 * try to look it up again.
		 */
		if (pp == NULL)
			goto again;
		if (rw != S_CREATE) {
			err = swap_getphysname(vp, off, &pvp, &poff);
			if (pvp) {
				struct anon *ap;
				kmutex_t *ahm;

				flags = (pl == NULL ? B_ASYNC|B_READ : B_READ);
				err = fop_pageio(pvp, pp, poff,
				    PAGESIZE, flags, cr, NULL);

				if (!err) {
					ahm = AH_MUTEX(vp, off);
					mutex_enter(ahm);

					ap = swap_anon(vp, off);
					if (ap == NULL)
						panic("swap_getapage:"
						    " null anon");

					if (ap->an_pvp == pvp &&
					    ap->an_poff == poff) {
						swap_phys_free(pvp, poff,
						    PAGESIZE);
						ap->an_pvp = NULL;
						ap->an_poff = 0;
						hat_setmod(pp);
					}

					mutex_exit(ahm);
				}
			} else {
				if (!err)
					pagezero(pp, 0, PAGESIZE);

				/*
				 * If it's a fault ahead, release page_io_lock
				 * and SE_EXCL we grabbed in page_create_va
				 *
				 * If we are here, we haven't called fop_pageio
				 * and thus calling pvn_read_done(pp, B_READ)
				 * below may mislead that we tried i/o. Besides,
				 * in case of async, pvn_read_done() should
				 * not be called by *getpage()
				 */
				if (pl == NULL) {
					/*
					 * swap_getphysname can return error
					 * only when we are getting called from
					 * swapslot_free which passes non-NULL
					 * pl to fop_getpage.
					 */
					ASSERT(err == 0);
					page_io_unlock(pp);
					page_unlock(pp);
				}
			}
		}

		ASSERT(pp != NULL);

		if (err && pl)
			pvn_read_done(pp, B_ERROR);

		if (!err && pl)
			pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw);
	}
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getapage:pp %p vp %p off %llx", pp, vp, off);
	return (err);
}
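
/*
 * Note that a successful swap-in above immediately frees the physical
 * swap slot (swap_phys_free()) and marks the page modified, so a fresh
 * slot is allocated by swap_newphysname() only if the page is later
 * pushed out again.
 */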

/*
 * Called from large page anon routines only! This is an ugly hack where
 * the anon layer directly calls into swapfs with a preallocated large page.
 * Another method would have been to change the VOP and add an extra arg for
 * the preallocated large page. This all could be cleaned up later when we
 * solve the anonymous naming problem and no longer need to loop across the
 * VOP in PAGESIZE increments to fill in or initialize a large page as
 * is done today. I think the latter is better since it avoids a change to
 * the VOP interface that could later be avoided.
 */
int
swap_getconpage(struct vnode *vp, uoff_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, page_t *conpp, uint_t *pszc,
    spgcnt_t *nreloc, struct seg *seg, caddr_t addr, enum seg_rw rw,
    struct cred *cr)
{
	struct page *pp;
	int err = 0;
	struct vnode *pvp = NULL;
	uoff_t poff;

	ASSERT(len == PAGESIZE);
	ASSERT(pl != NULL);
	ASSERT(plsz == PAGESIZE);
	ASSERT(protp == NULL);
	ASSERT(nreloc != NULL);
	ASSERT(!SEG_IS_SEGKP(seg)); /* XXX for now not supported */
	SWAPFS_PRINT(SWAP_VOPS, "swap_getconpage: vp %p, off %llx, len %lx\n",
	    vp, off, len, 0, 0);

	/*
	 * If we are not using a preallocated page then we know one already
	 * exists. So just let the old code handle it.
	 */
	if (conpp == NULL) {
		err = swap_getapage(vp, (uoff_t)off, len, protp, pl, plsz,
		    seg, addr, rw, cr);
		return (err);
	}
	ASSERT(conpp->p_szc != 0);
	ASSERT(PAGE_EXCL(conpp));

	ASSERT(conpp->p_next == conpp);
	ASSERT(conpp->p_prev == conpp);
	ASSERT(!PP_ISAGED(conpp));
	ASSERT(!PP_ISFREE(conpp));

	*nreloc = 0;
	pp = page_lookup_create(&vp->v_object, off, SE_SHARED, conpp, nreloc,
	    0);

	/*
	 * If an existing page is found we may need to relocate.
	 */
	if (pp != conpp) {
		ASSERT(rw != S_CREATE);
		ASSERT(pszc != NULL);
		ASSERT(PAGE_SHARED(pp));
		if (pp->p_szc < conpp->p_szc) {
			*pszc = pp->p_szc;
			page_unlock(pp);
			err = -1;
		} else if (pp->p_szc > conpp->p_szc &&
		    seg->s_szc > conpp->p_szc) {
			*pszc = MIN(pp->p_szc, seg->s_szc);
			page_unlock(pp);
			err = -2;
		} else {
			pl[0] = pp;
			pl[1] = NULL;
			if (page_pptonum(pp) &
			    (page_get_pagecnt(conpp->p_szc) - 1))
				cmn_err(CE_PANIC, "swap_getconpage: no root");
		}
		return (err);
	}

	ASSERT(PAGE_EXCL(pp));

	if (*nreloc != 0) {
		ASSERT(rw != S_CREATE);
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	*nreloc = 1;

	/*
	 * If necessary do the page io.
	 */
	if (rw != S_CREATE) {
		/*
		 * Since we are only called now on behalf of an
		 * address space operation it's impossible for
		 * us to fail unlike swap_getapage() which
		 * also gets called from swapslot_free().
		 */
		if (swap_getphysname(vp, off, &pvp, &poff)) {
			cmn_err(CE_PANIC,
			    "swap_getconpage: swap_getphysname failed!");
		}

		if (pvp != NULL) {
			err = fop_pageio(pvp, pp, poff, PAGESIZE, B_READ,
			    cr, NULL);
			if (err == 0) {
				struct anon *ap;
				kmutex_t *ahm;

				ahm = AH_MUTEX(vp, off);
				mutex_enter(ahm);
				ap = swap_anon(vp, off);
				if (ap == NULL)
					panic("swap_getconpage: null anon");
				if (ap->an_pvp != pvp || ap->an_poff != poff)
					panic("swap_getconpage: bad anon");

				swap_phys_free(pvp, poff, PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
				hat_setmod(pp);
				mutex_exit(ahm);
			}
		} else {
			pagezero(pp, 0, PAGESIZE);
		}
	}

	/*
	 * Normally we would let pvn_read_done() destroy
	 * the page on IO error. But since this is a preallocated
	 * page we'll let the anon layer handle it.
	 */
	page_io_unlock(pp);
	if (err != 0) {
		page_hashout(pp, false);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
	} else {
		pl[0] = pp;
		pl[1] = NULL;
	}

	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_GETAPAGE,
	    "swapfs getconpage:pp %p vp %p off %llx", pp, vp, off);
	return (err);
}
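
/*
 * The -1/-2 returns above are not errnos: when an existing page with a
 * different page size is found, *pszc is set to the size observed and the
 * negative value tells the large page anon caller to retry (presumably
 * with that size; see the p_szc checks above).
 */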

/* Async putpage klustering stuff */
int sw_pending_size;
extern int klustsize;
extern struct async_reqs *sw_getreq();
extern void sw_putreq(struct async_reqs *);
extern void sw_putbackreq(struct async_reqs *);
extern struct async_reqs *sw_getfree();
extern void sw_putfree(struct async_reqs *);

static size_t swap_putpagecnt, swap_pagespushed;
static size_t swap_otherfail, swap_otherpages;
static size_t swap_klustfail, swap_klustpages;
static size_t swap_getiofail, swap_getiopages;
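
/*
 * swap_putpagecnt/swap_pagespushed count successful pushes and the pages
 * they moved; each remaining pair records why a kluster was cut short
 * (no request, page lookup failure, slot not contiguous) and how many
 * pages had been klustered when that happened.  See the bump sites in
 * swap_putapage().
 */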

/*
 * Flags are composed of {B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}.
 * If len == 0, do from off to EOF.
 */
static int swap_nopage = 0;	/* Don't do swap_putpage's if set */

/* ARGSUSED */
static int
swap_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
    struct cred *cr, caller_context_t *ct)
{
	page_t *pp;
	uoff_t io_off;
	size_t io_len = 0;
	int err = 0;
	int nowait;
	struct async_reqs *arg;

	if (swap_nopage)
		return (0);

	ASSERT(vp->v_count != 0);

	nowait = flags & B_PAGE_NOWAIT;

	/*
	 * Clear force flag so that p_lckcnt pages are not invalidated.
	 */
	flags &= ~(B_FORCE | B_PAGE_NOWAIT);

	SWAPFS_PRINT(SWAP_VOPS,
	    "swap_putpage: vp %p, off %llx len %lx, flags %x\n",
	    (void *)vp, off, len, flags, 0);
	TRACE_3(TR_FAC_SWAPFS, TR_SWAPFS_PUTPAGE,
	    "swapfs putpage:vp %p off %llx len %ld", (void *)vp, off, len);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (!vn_has_cached_data(vp))
		return (0);

	if (len == 0) {
		if (curproc == proc_pageout)
			cmn_err(CE_PANIC, "swapfs: pageout can't block");

		/* Search the entire vp list for pages >= off. */
		err = pvn_vplist_dirty(vp, (uoff_t)off, swap_putapage,
		    flags, cr);
	} else {
		uoff_t eoff;

		/*
		 * Loop over all offsets in the range [off...off + len]
		 * looking for pages to deal with.
		 */
		eoff = off + len;
		for (io_off = (uoff_t)off; io_off < eoff;
		    io_off += io_len) {
			io_len = PAGESIZE;

			/*
			 * If we run out of the async req slot, put the page
			 * now instead of queuing.
			 */
			if (flags == (B_ASYNC | B_FREE) &&
			    sw_pending_size < klustsize &&
			    (arg = sw_getfree())) {
				/*
				 * If we are clustering, we should allow
				 * pageout to feed us more pages because # of
				 * pushes is limited by # of I/Os, and one
				 * cluster is considered to be one I/O.
				 */
				if (pushes)
					pushes--;

				arg->a_vp = vp;
				arg->a_off = io_off;
				arg->a_len = PAGESIZE;
				arg->a_flags = B_ASYNC | B_FREE;
				arg->a_cred = kcred;
				sw_putreq(arg);
				continue;
			}

			/*
			 * If we are not invalidating pages, use the
			 * routine page_lookup_nowait() to prevent
			 * reclaiming them from the free list.
			 */
			if (!nowait && ((flags & B_INVAL) ||
			    (flags & (B_ASYNC | B_FREE)) == B_FREE))
				pp = page_lookup(&vp->v_object, io_off,
				    SE_EXCL);
			else
				pp = page_lookup_nowait(&vp->v_object, io_off,
				    (flags & (B_FREE | B_INVAL)) ?
				    SE_EXCL : SE_SHARED);

			if (pp == NULL || pvn_getdirty(pp, flags) == 0)
				continue;

			err = swap_putapage(vp, pp, &io_off, &io_len,
			    flags, cr);
			if (err != 0)
				break;
		}
	}
	/* If invalidating, verify all pages on vnode list are gone. */
	if (err == 0 && off == 0 && len == 0 &&
	    (flags & B_INVAL) && vn_has_cached_data(vp)) {
		cmn_err(CE_WARN,
		    "swap_putpage: B_INVAL, pages not gone");
	}
	return (err);
}

/*
 * Write out a single page.
 * For swapfs this means choose a physical swap slot and write the page
 * out using fop_pageio.
 * In the (B_ASYNC | B_FREE) case we try to find a bunch of other dirty
 * swapfs pages, a bunch of contiguous swap slots and then write them
 * all out in one clustered i/o.
 */
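/*
 * Illustrative example (not from the original source, assuming 4K pages):
 * if the first page lands at (klvp, klstart == 0x2000) so klsz == 0x1000,
 * then a queued page whose new slot is poff == 0x3000 (klstart + klsz)
 * grows the kluster forward, one at poff == 0x1000 (klstart - PAGESIZE)
 * grows it backward, and any other slot ends the kluster and puts the
 * request back on the queue.
 */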
/*ARGSUSED*/
static int
swap_putapage(struct vnode *vp, page_t *pp, uoff_t *offp, size_t *lenp,
    int flags, struct cred *cr)
{
	int err;
	uoff_t off;
	uoff_t doff;
	size_t dlen;
	size_t klsz = 0;
	uoff_t klstart = 0;
	struct vnode *klvp = NULL;
	struct vnode *pvp;
	uoff_t poff;
	page_t *pplist;
	se_t se;
	struct async_reqs *arg;
	size_t swap_klustsize;

	/*
	 * This check is added for callers that invoke swap_putpage() with
	 * len = 0.  swap_putpage() calls swap_putapage() page-by-page via
	 * pvn_vplist_dirty(), so the same queuing must happen here when the
	 * caller passed the same B_ASYNC|B_FREE flags.
	 */
	if (flags == (B_ASYNC | B_FREE) &&
	    sw_pending_size < klustsize && (arg = sw_getfree())) {

		hat_setmod(pp);
		page_io_unlock(pp);
		page_unlock(pp);

		arg->a_vp = vp;
		arg->a_off = pp->p_offset;
		arg->a_len = PAGESIZE;
		arg->a_flags = B_ASYNC | B_FREE;
		arg->a_cred = kcred;
		sw_putbackreq(arg);
		return (0);
	}

	SWAPFS_PRINT(SWAP_PUTP,
	    "swap_putapage: pp %p, vp %p, off %llx, flags %x\n",
	    pp, vp, pp->p_offset, flags, 0);

	ASSERT(PAGE_LOCKED(pp));

	off = pp->p_offset;
	doff = off;
	dlen = PAGESIZE;

	if (err = swap_newphysname(vp, off, &doff, &dlen, &pvp, &poff)) {
		err = (flags == (B_ASYNC | B_FREE) ? ENOMEM : 0);
		hat_setmod(pp);
		page_io_unlock(pp);
		page_unlock(pp);
		goto out;
	}

	klvp = pvp;
	klstart = poff;
	pplist = pp;
	klsz = PAGESIZE;

	/*
	 * If this is ASYNC | FREE and we've accumulated a bunch of such
	 * pending requests, kluster.
	 */
	if (flags == (B_ASYNC | B_FREE))
		swap_klustsize = klustsize;
	else
		swap_klustsize = PAGESIZE;
	se = (flags & B_FREE ? SE_EXCL : SE_SHARED);
	while (klsz < swap_klustsize) {
		if ((arg = sw_getreq()) == NULL) {
			swap_getiofail++;
			swap_getiopages += btop(klsz);
			break;
		}
		ASSERT(vn_matchops(arg->a_vp, &swap_vnodeops));
		vp = arg->a_vp;
		off = arg->a_off;

		if ((pp = page_lookup_nowait(&vp->v_object, off, se)) ==
		    NULL) {
			swap_otherfail++;
			swap_otherpages += btop(klsz);
			sw_putfree(arg);
			break;
		}
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0) {
			sw_putfree(arg);
			continue;
		}
		/* Get new physical backing store for the page */
		doff = off;
		dlen = PAGESIZE;
		if (err = swap_newphysname(vp, off, &doff, &dlen,
		    &pvp, &poff)) {
			swap_otherfail++;
			swap_otherpages += btop(klsz);
			hat_setmod(pp);
			page_io_unlock(pp);
			page_unlock(pp);
			sw_putbackreq(arg);
			break;
		}
		/* Try to cluster new physical name with previous ones */
		if (klvp == pvp && poff == klstart + klsz) {
			klsz += PAGESIZE;
			page_add(&pplist, pp);
			pplist = pplist->p_next;
			sw_putfree(arg);
		} else if (klvp == pvp && poff == klstart - PAGESIZE) {
			klsz += PAGESIZE;
			klstart -= PAGESIZE;
			page_add(&pplist, pp);
			sw_putfree(arg);
		} else {
			swap_klustfail++;
			swap_klustpages += btop(klsz);
			hat_setmod(pp);
			page_io_unlock(pp);
			page_unlock(pp);
			sw_putbackreq(arg);
			break;
		}
	}

	err = fop_pageio(klvp, pplist, klstart, klsz,
	    B_WRITE | flags, cr, NULL);

	if ((flags & B_ASYNC) == 0)
		pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);

	if (!err) {
		swap_putpagecnt++;
		swap_pagespushed += btop(klsz);
	}
out:
	TRACE_4(TR_FAC_SWAPFS, TR_SWAPFS_PUTAPAGE,
	    "swapfs putapage:vp %p klvp %p, klstart %lx, klsz %lx",
	    vp, klvp, klstart, klsz);
	if (err && err != ENOMEM)
		cmn_err(CE_WARN, "swapfs_putapage: err %d\n", err);
	if (lenp)
		*lenp = PAGESIZE;
	return (err);
}
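
/*
 * Note on the sync/async split above: pvn_write_done() is only called
 * inline for synchronous pushes; for B_ASYNC requests the pages stay
 * locked here and are unlocked later by the i/o completion path.
 */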

/*ARGSUSED*/
static void
swap_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int err;
	uoff_t off = pp->p_offset;
	vnode_t *pvp;
	uoff_t poff;

	ASSERT(PAGE_EXCL(pp));

	/*
	 * The caller will free/invalidate a large page in one shot instead
	 * of one small page at a time.
	 */
	if (pp->p_szc != 0) {
		page_unlock(pp);
		return;
	}

	err = swap_getphysname(vp, off, &pvp, &poff);
	if (!err && pvp != NULL)
		fop_dispose(pvp, pp, fl, dn, cr, ct);
	else
		fs_dispose(vp, pp, fl, dn, cr, ct);
}