/*	$NetBSD: uvm_swap.c,v 1.146 2009/09/13 18:45:12 pooka Exp $	*/

/*
 * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
 * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.146 2009/09/13 18:45:12 pooka Exp $");

#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>

#include <miscfs/specfs/specdev.h>

/*
 * uvm_swap.c: manage configuration and i/o to swap space.
 */

/*
 * swap space is managed in the following way:
 *
 * each swap partition or file is described by a "swapdev" structure.
 * each "swapdev" structure contains a "swapent" structure which contains
 * information that is passed up to the user (via system calls).
 *
 * each swap partition is assigned a "priority" (int) which controls
 * swap partition usage.
 *
 * the system maintains a global data structure describing all swap
 * partitions/files.  there is a sorted LIST of "swappri" structures
 * which describe "swapdev"'s at that priority.  this LIST is headed
 * by the "swap_priority" global var.  each "swappri" contains a
 * CIRCLEQ of "swapdev" structures at that priority.
 *
 * locking:
 *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
 *    system call and prevents the swap priority list from changing
 *    while we are in the middle of a system call (e.g. SWAP_STATS).
 *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
 *    structures including the priority list, the swapdev structures,
 *    and the swapmap arena.
 *
 * each swap device has the following info:
 *  - swap device in use (could be disabled, preventing future use)
 *  - swap enabled (allows new allocations on swap)
 *  - map info in /dev/drum
 *
 * for swap files only:
 *  - max byte count in buffer
 *
 * userland controls and configures swap with the swapctl(2) system call.
 * the sys_swapctl() function performs the following operations:
 *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 *	(passed in via "arg") of a size passed in via "misc" ... we load
 *	the current swap config into the array.  The actual work is done
 *	in the uvm_swap_stats(9) function.
 *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 *	priority in "misc", start swapping on it.
 *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 *	"misc")
 */
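
/*
 * Illustrative userland sketch (an addition for exposition, not part of
 * the kernel proper): querying swap state with swapctl(2) per operations
 * [1] and [2] above.  Variable names are local to this example and error
 * handling is minimal.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/swap.h>

int
main(void)
{
	struct swapent *sep;
	int i, n;

	n = swapctl(SWAP_NSWAP, NULL, 0);	/* [1] number of devices */
	if (n <= 0)
		return 0;
	sep = malloc(n * sizeof(*sep));
	if (sep == NULL || swapctl(SWAP_STATS, sep, n) == -1)	/* [2] */
		return 1;
	for (i = 0; i < n; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);
	free(sep);
	return 0;
}
#endif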

/*
 * swapdev: describes a single swap partition/file
 *
 * note the following should be true:
 * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 */
struct swapdev {
	dev_t			swd_dev;	/* device id */
	int			swd_flags;	/* flags:inuse/enable/fake */
	int			swd_priority;	/* our priority */
	int			swd_nblks;	/* blocks in this device */
	char			*swd_path;	/* saved pathname of device */
	int			swd_pathlen;	/* length of pathname */
	int			swd_npages;	/* #pages we can use */
	int			swd_npginuse;	/* #pages in use */
	int			swd_npgbad;	/* #pages bad */
	int			swd_drumoffset;	/* page0 offset in drum */
	int			swd_drumsize;	/* #pages in drum */
	blist_t			swd_blist;	/* blist for this swapdev */
	struct vnode		*swd_vp;	/* backing vnode */
	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */

	int			swd_bsize;	/* blocksize (bytes) */
	int			swd_maxactive;	/* max active i/o reqs */
	struct bufq_state	*swd_tab;	/* buffer list */
	int			swd_active;	/* number of active buffers */
};
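
/*
 * Illustrative consistency sketch (an addition for exposition): the
 * invariants noted above can be spot-checked with KASSERTs such as
 * these.  "swapdev_sanity" is a hypothetical helper, not part of the
 * original file.
 */
#if 0
static void
swapdev_sanity(const struct swapdev *sdp)
{
	/* pages in use (which include the bad ones) never exceed total */
	KASSERT(sdp->swd_npgbad <= sdp->swd_npginuse);
	KASSERT(sdp->swd_npginuse <= sdp->swd_npages);
	/* the drum window covers at least the usable pages */
	KASSERT(sdp->swd_npages <= sdp->swd_drumsize);
}
#endif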

/*
 * swap device priority entry; the list is kept sorted on `spi_priority'.
 */
struct swappri {
	int			spi_priority;	/* priority */
	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
	/* circleq of swapdevs at this priority */
	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
};

/*
 * The following two structures are used to keep track of data transfers
 * on swap devices associated with regular files.
 * NOTE: this code is more or less a copy of vnd.c; we use the same
 * structure names here to ease porting..
 */
struct vndxfer {
	struct buf	*vx_bp;		/* Pointer to parent buffer */
	struct swapdev	*vx_sdp;
	int		vx_error;
	int		vx_pending;	/* # of pending aux buffers */
	int		vx_flags;
#define VX_BUSY		1
#define VX_DEAD		2
};

struct vndbuf {
	struct buf	vb_buf;
	struct vndxfer	*vb_xfer;
};

/*
 * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t and has no se_path[] member.
 */
struct swapent13 {
	int32_t	se13_dev;		/* device id */
	int	se13_flags;		/* flags */
	int	se13_nblks;		/* total blocks */
	int	se13_inuse;		/* blocks in use */
	int	se13_priority;		/* priority of this device */
};

/*
 * NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 * dev_t.
 */
struct swapent50 {
	int32_t	se50_dev;		/* device id */
	int	se50_flags;		/* flags */
	int	se50_nblks;		/* total blocks */
	int	se50_inuse;		/* blocks in use */
	int	se50_priority;		/* priority of this device */
	char	se50_path[PATH_MAX+1];	/* path name */
};

/*
 * We keep a pool of vndbuf's and vndxfer structures.
 */
static struct pool vndxfer_pool, vndbuf_pool;

MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");

static vmem_t *swapmap;	/* controls the mapping of /dev/drum */

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;

/* locks */
static krwlock_t swap_syscall_lock;

/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;

/* tuneables */
u_int uvm_swapisfull_factor = 99;

/*
 * prototypes
 */
static struct swapdev	*swapdrum_getsdp(int);

static struct swapdev	*swaplist_find(struct vnode *, bool);
static void		 swaplist_insert(struct swapdev *,
					 struct swappri *, int);
static void		 swaplist_trim(void);

static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);

static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);

static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);

static int uvm_swap_io(struct vm_page **, int, int, int);

/*
 * uvm_swap_init: init the swap system data structures and locks
 *
 * => called at boot time from init_main.c after the filesystems
 *	are brought up (which happens after uvm_init())
 */
void
uvm_swap_init(void)
{
	UVMHIST_FUNC("uvm_swap_init");

	UVMHIST_CALLED(pdhist);
	/*
	 * first, init the swap list, its counter, and its lock.
	 * then get a handle on the vnode for /dev/drum by using
	 * its dev_t number ("swapdev", from MD conf.c).
	 */

	LIST_INIT(&swap_priority);
	uvmexp.nswapdev = 0;
	rw_init(&swap_syscall_lock);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	if (bdevvp(swapdev, &swapdev_vp))
		panic("%s: can't get vnode for swap device", __func__);
	if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
		panic("%s: can't lock swap device", __func__);
	if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
		panic("%s: can't open swap device", __func__);
	VOP_UNLOCK(swapdev_vp, 0);

	/*
	 * create swap block resource map to map /dev/drum. the range
	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
	 * that block 0 is reserved (used to indicate an allocation
	 * failure, or no allocation).
	 */
	swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
	    VM_NOSLEEP, IPL_NONE);
	if (swapmap == NULL)
		panic("%s: vmem_create failed", __func__);

	pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
	    NULL, IPL_BIO);
	pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
	    NULL, IPL_BIO);

	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}

/*
 * swaplist functions: functions that operate on the list of swap
 * devices on the system.
 */

/*
 * swaplist_insert: insert swap device "sdp" into the global list
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => caller must provide a newly malloc'd swappri structure (we will
 *	FREE it if we don't need it... this is to prevent malloc from
 *	blocking here while adding swap)
 */
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
	struct swappri *spp, *pspp;
	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);

	/*
	 * find entry at or after which to insert the new device.
	 */
	pspp = NULL;
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		if (priority <= spp->spi_priority)
			break;
		pspp = spp;
	}

	/*
	 * new priority?
	 */
	if (spp == NULL || spp->spi_priority != priority) {
		spp = newspp;	/* use newspp! */
		UVMHIST_LOG(pdhist, "created new swappri = %d",
		    priority, 0, 0, 0);

		spp->spi_priority = priority;
		CIRCLEQ_INIT(&spp->spi_swapdev);

		if (pspp)
			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
		else
			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	} else {
		/* we don't need a new priority structure, free it */
		free(newspp, M_VMSWAP);
	}

	/*
	 * priority found (or created).  now insert on the priority's
	 * circleq list and bump the total number of swapdevs.
	 */
	sdp->swd_priority = priority;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
	uvmexp.nswapdev++;
}

/*
 * swaplist_find: find and optionally remove a swap device from the
 *	global list.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 * => we return the swapdev we found (and removed)
 */
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
	struct swapdev *sdp;
	struct swappri *spp;

	/*
	 * search the lists for the requested vp
	 */

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_vp == vp) {
				if (remove) {
					CIRCLEQ_REMOVE(&spp->spi_swapdev,
					    sdp, swd_next);
					uvmexp.nswapdev--;
				}
				return sdp;
			}
		}
	}
	return NULL;
}

/*
 * swaplist_trim: scan priority list for empty priority entries and kill
 *	them.
 *
 * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 */
static void
swaplist_trim(void)
{
	struct swappri *spp, *nextspp;

	for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
		nextspp = LIST_NEXT(spp, spi_swappri);
		if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
		    (void *)&spp->spi_swapdev)
			continue;
		LIST_REMOVE(spp, spi_swappri);
		free(spp, M_VMSWAP);
	}
}

/*
 * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 *	to the "swapdev" that maps that section of the drum.
 *
 * => each swapdev takes one big contig chunk of the drum
 * => caller must hold uvm_swap_data_lock
 */
static struct swapdev *
swapdrum_getsdp(int pgno)
{
	struct swapdev *sdp;
	struct swappri *spp;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			if (sdp->swd_flags & SWF_FAKE)
				continue;
			if (pgno >= sdp->swd_drumoffset &&
			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
				return sdp;
			}
		}
	}
	return NULL;
}
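
/*
 * Worked example (illustrative numbers): with one swapdev at drum pages
 * [1, 1024] (swd_drumoffset 1, swd_drumsize 1024) and another at
 * [1025, 3072] (offset 1025, size 2048), a lookup of pgno 2000 returns
 * the second device, since 1025 <= 2000 < 1025 + 2048.
 */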

/*
 * sys_swapctl: main entry point for swapctl(2) system call
 *	[with two helper functions: swap_on and swap_off]
 */
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) cmd;
		syscallarg(void *) arg;
		syscallarg(int) misc;
	} */
	struct vnode *vp;
	struct nameidata nd;
	struct swappri *spp;
	struct swapdev *sdp;
	struct swapent *sep;
#define SWAP_PATH_MAX (PATH_MAX + 1)
	char	*userpath;
	size_t	len;
	int	error, misc;
	int	priority;
	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);

	misc = SCARG(uap, misc);

	/*
	 * ensure serialized syscall access by grabbing the swap_syscall_lock
	 */
	rw_enter(&swap_syscall_lock, RW_WRITER);

	userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
	/*
	 * we handle the non-priv NSWAP and STATS request first.
	 *
	 * SWAP_NSWAP: return number of config'd swap devices
	 * [can also be obtained with uvmexp sysctl]
	 */
	if (SCARG(uap, cmd) == SWAP_NSWAP) {
		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
		    0, 0, 0);
		*retval = uvmexp.nswapdev;
		error = 0;
		goto out;
	}

	/*
	 * SWAP_STATS: get stats on current # of configured swap devs
	 *
	 * note that the swap_priority list can't change as long
	 * as we are holding the swap_syscall_lock.  we don't want
	 * to grab the uvm_swap_data_lock because we may fault&sleep during
	 * copyout() and we don't want to be holding that lock then!
	 */
	if (SCARG(uap, cmd) == SWAP_STATS
#if defined(COMPAT_50)
	    || SCARG(uap, cmd) == SWAP_STATS50
#endif
#if defined(COMPAT_13)
	    || SCARG(uap, cmd) == SWAP_STATS13
#endif
	    ) {
		if ((size_t)misc > (size_t)uvmexp.nswapdev)
			misc = uvmexp.nswapdev;
#if defined(COMPAT_13)
		if (SCARG(uap, cmd) == SWAP_STATS13)
			len = sizeof(struct swapent13) * misc;
		else
#endif
#if defined(COMPAT_50)
		if (SCARG(uap, cmd) == SWAP_STATS50)
			len = sizeof(struct swapent50) * misc;
		else
#endif
			len = sizeof(struct swapent) * misc;
		sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);

		uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
		error = copyout(sep, SCARG(uap, arg), len);

		free(sep, M_TEMP);
		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
		goto out;
	}
	if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
		dev_t	*devp = (dev_t *)SCARG(uap, arg);

		error = copyout(&dumpdev, devp, sizeof(dumpdev));
		goto out;
	}

	/*
	 * all other requests require superuser privs.   verify.
	 */
	if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
	    0, NULL, NULL, NULL)))
		goto out;

	if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
		/* drop the current dump device */
		dumpdev = NODEV;
		dumpcdev = NODEV;
		cpu_dumpconf();
		error = 0;
		goto out;
	}

	/*
	 * at this point we expect a path name in arg.   we will
	 * use namei() to gain a vnode reference (vref), and lock
	 * the vnode (VOP_LOCK).
	 *
	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
	 * miniroot)
	 */
	if (SCARG(uap, arg) == NULL) {
		vp = rootvp;		/* miniroot */
		if (vget(vp, LK_EXCLUSIVE)) {
			error = EBUSY;
			goto out;
		}
		if (SCARG(uap, cmd) == SWAP_ON &&
		    copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
			panic("swapctl: miniroot copy failed");
	} else {
		int	space;
		char	*where;

		if (SCARG(uap, cmd) == SWAP_ON) {
			if ((error = copyinstr(SCARG(uap, arg), userpath,
			    SWAP_PATH_MAX, &len)))
				goto out;
			space = UIO_SYSSPACE;
			where = userpath;
		} else {
			space = UIO_USERSPACE;
			where = (char *)SCARG(uap, arg);
		}
		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
		    space, where);
		if ((error = namei(&nd)))
			goto out;
		vp = nd.ni_vp;
	}
	/* note: "vp" is referenced and locked */

	error = 0;		/* assume no error */
	switch(SCARG(uap, cmd)) {

	case SWAP_DUMPDEV:
		if (vp->v_type != VBLK) {
			error = ENOTBLK;
			break;
		}
		if (bdevsw_lookup(vp->v_rdev)) {
			dumpdev = vp->v_rdev;
			dumpcdev = devsw_blk2chr(dumpdev);
		} else
			dumpdev = NODEV;
		cpu_dumpconf();
		break;

	case SWAP_CTL:
		/*
		 * get new priority, remove old entry (if any) and then
		 * reinsert it in the correct place.  finally, prune out
		 * any empty priority structures.
		 */
		priority = SCARG(uap, misc);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, true)) == NULL) {
			error = ENOENT;
		} else {
			swaplist_insert(sdp, spp, priority);
			swaplist_trim();
		}
		mutex_exit(&uvm_swap_data_lock);
		if (error)
			free(spp, M_VMSWAP);
		break;

	case SWAP_ON:

		/*
		 * check for duplicates.   if none found, then insert a
		 * dummy entry on the list to prevent someone else from
		 * trying to enable this device while we are working on
		 * it.
		 */

		priority = SCARG(uap, misc);
		sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
		spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
		memset(sdp, 0, sizeof(*sdp));
		sdp->swd_flags = SWF_FAKE;
		sdp->swd_vp = vp;
		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
		bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
		mutex_enter(&uvm_swap_data_lock);
		if (swaplist_find(vp, false) != NULL) {
			error = EBUSY;
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp, M_VMSWAP);
			free(spp, M_VMSWAP);
			break;
		}
		swaplist_insert(sdp, spp, priority);
		mutex_exit(&uvm_swap_data_lock);

		sdp->swd_pathlen = len;
		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
			panic("swapctl: copystr");

		/*
		 * we've now got a FAKE placeholder in the swap list.
		 * now attempt to enable swap on it.  if we fail, undo
		 * what we've done and kill the fake entry we just inserted.
		 * if swap_on is a success, it will clear the SWF_FAKE flag
		 */

		if ((error = swap_on(l, sdp)) != 0) {
			mutex_enter(&uvm_swap_data_lock);
			(void) swaplist_find(vp, true);  /* kill fake entry */
			swaplist_trim();
			mutex_exit(&uvm_swap_data_lock);
			bufq_free(sdp->swd_tab);
			free(sdp->swd_path, M_VMSWAP);
			free(sdp, M_VMSWAP);
			break;
		}
		break;

	case SWAP_OFF:
		mutex_enter(&uvm_swap_data_lock);
		if ((sdp = swaplist_find(vp, false)) == NULL) {
			mutex_exit(&uvm_swap_data_lock);
			error = ENXIO;
			break;
		}

		/*
		 * If a device isn't in use or enabled, we
		 * can't stop swapping from it (again).
		 */

		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
			mutex_exit(&uvm_swap_data_lock);
			error = EBUSY;
			break;
		}

		/*
		 * do the real work.
		 */
		error = swap_off(l, sdp);
		break;

	default:
		error = EINVAL;
	}

	/*
	 * done!  release the ref gained by namei() and unlock.
	 */
	vput(vp);

out:
	free(userpath, M_TEMP);
	rw_exit(&swap_syscall_lock);

	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
	return (error);
}

/*
 * uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
 * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 * emulation to use it directly without going through sys_swapctl().
 * The problem with using sys_swapctl() there is that it involves
 * copying the swapent array to the stackgap, and this array's size
 * is not known at build time. Hence it would not be possible to
 * ensure it would fit in the stackgap in any case.
 */
void
uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
{

	rw_enter(&swap_syscall_lock, RW_READER);
	uvm_swap_stats_locked(cmd, sep, sec, retval);
	rw_exit(&swap_syscall_lock);
}

static void
uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
{
	struct swappri *spp;
	struct swapdev *sdp;
	int count = 0;

	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
		     sdp != (void *)&spp->spi_swapdev && sec-- > 0;
		     sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
			int inuse;

			/*
			 * backwards compatibility for system call.
			 * For NetBSD 1.3 and 5.0, we have to use
			 * the 32 bit dev_t.  For 5.0 and -current
			 * we have to add the path.
			 */
			inuse = btodb((uint64_t)sdp->swd_npginuse <<
			    PAGE_SHIFT);

#if defined(COMPAT_13) || defined(COMPAT_50)
			if (cmd == SWAP_STATS) {
#endif
				sep->se_dev = sdp->swd_dev;
				sep->se_flags = sdp->swd_flags;
				sep->se_nblks = sdp->swd_nblks;
				sep->se_inuse = inuse;
				sep->se_priority = sdp->swd_priority;
				memcpy(&sep->se_path, sdp->swd_path,
				    sizeof sep->se_path);
				sep++;
#if defined(COMPAT_13)
			} else if (cmd == SWAP_STATS13) {
				struct swapent13 *sep13 =
				    (struct swapent13 *)sep;

				sep13->se13_dev = sdp->swd_dev;
				sep13->se13_flags = sdp->swd_flags;
				sep13->se13_nblks = sdp->swd_nblks;
				sep13->se13_inuse = inuse;
				sep13->se13_priority = sdp->swd_priority;
				sep = (struct swapent *)(sep13 + 1);
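				/*
				 * note the stride: a swapent13 is smaller
				 * than a struct swapent, so "sep" advances
				 * by sizeof(struct swapent13) here and the
				 * output array stays densely packed.
				 */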
#endif
#if defined(COMPAT_50)
			} else if (cmd == SWAP_STATS50) {
				struct swapent50 *sep50 =
				    (struct swapent50 *)sep;

				sep50->se50_dev = sdp->swd_dev;
				sep50->se50_flags = sdp->swd_flags;
				sep50->se50_nblks = sdp->swd_nblks;
				sep50->se50_inuse = inuse;
				sep50->se50_priority = sdp->swd_priority;
				memcpy(&sep50->se50_path, sdp->swd_path,
				    sizeof sep50->se50_path);
				sep = (struct swapent *)(sep50 + 1);
#endif
#if defined(COMPAT_13) || defined(COMPAT_50)
			}
#endif
			count++;
		}
	}

	*retval = count;
}

/*
 * swap_on: attempt to enable a swapdev for swapping.   note that the
 *	swapdev is already on the global list, but disabled (marked
 *	SWF_FAKE).
 *
 * => we avoid the start of the disk (to protect disk labels)
 * => we also avoid the miniroot, if we are swapping to root.
 * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 *	if needed.
 */
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
	struct vnode *vp;
	int error, npages, nblocks, size;
	long addr;
	vmem_addr_t result;
	struct vattr va;
	extern int (**nfsv2_vnodeop_p)(void *);
	const struct bdevsw *bdev;
	dev_t dev;
	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);

	/*
	 * we want to enable swapping on sdp.   the swd_vp contains
	 * the vnode we want (locked and ref'd), and the swd_dev
	 * contains the dev_t of the file, if it is a block device.
	 */

	vp = sdp->swd_vp;
	dev = sdp->swd_dev;

	/*
	 * open the swap file (mostly useful for block device files to
	 * let device driver know what is up).
	 *
	 * we skip the open/close for root on swap because the root
	 * has already been opened when root was mounted (mountroot).
	 */
	if (vp != rootvp) {
		if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
			return (error);
	}

	/* XXX this only works for block devices */
	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);

	/*
	 * we now need to determine the size of the swap area.   for
	 * block specials we can call the d_psize function.
	 * for normal files, we must stat [get attrs].
	 *
	 * we put the result in nblks.
	 * for normal files, we also want the filesystem block size
	 * (which we get with statfs).
	 */
	switch (vp->v_type) {
	case VBLK:
		bdev = bdevsw_lookup(dev);
		if (bdev == NULL || bdev->d_psize == NULL ||
		    (nblocks = (*bdev->d_psize)(dev)) == -1) {
			error = ENXIO;
			goto bad;
		}
		break;

	case VREG:
		if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
			goto bad;
		nblocks = (int)btodb(va.va_size);
		if ((error =
		     VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
			goto bad;

		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
		/*
		 * limit the max # of outstanding I/O requests we issue
		 * at any one time.   take it easy on NFS servers.
		 */
		if (vp->v_op == nfsv2_vnodeop_p)
			sdp->swd_maxactive = 2; /* XXX */
		else
			sdp->swd_maxactive = 8; /* XXX */
		break;

	default:
		error = ENXIO;
		goto bad;
	}

	/*
	 * save nblocks in a safe place and convert to pages.
	 */

	sdp->swd_nblks = nblocks;
	npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;

	/*
	 * for block special files, we want to make sure that we leave
	 * the disklabel and bootblocks alone, so we arrange to skip
	 * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
	 * note that because of this the "size" can be less than the
	 * actual number of blocks on the device.
	 */
	if (vp->v_type == VBLK) {
		/* we use pages 1 to (size - 1) [inclusive] */
		size = npages - 1;
		addr = 1;
	} else {
		/* we use pages 0 to (size - 1) [inclusive] */
		size = npages;
		addr = 0;
	}

	/*
	 * make sure we have enough blocks for a reasonable sized swap
	 * area.   we want at least one page.
	 */

	if (size < 1) {
		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
		error = EINVAL;
		goto bad;
	}

	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);

	/*
	 * now we need to allocate an extent to manage this swap device
	 */

	sdp->swd_blist = blist_create(npages);
	/* mark all except the `saved' region free. */
	blist_free(sdp->swd_blist, addr, size);

	/*
	 * if the vnode we are swapping to is the root vnode
	 * (i.e. we are swapping to the miniroot) then we want
	 * to make sure we don't overwrite it.   do a statfs to
	 * find its size and skip over it.
	 */
	if (vp == rootvp) {
		struct mount *mp;
		struct statvfs *sp;
		int rootblocks, rootpages;

		mp = rootvnode->v_mount;
		sp = &mp->mnt_stat;
		rootblocks = sp->f_blocks * btodb(sp->f_frsize);
		/*
		 * XXX: sp->f_blocks isn't the total number of
		 * blocks in the filesystem, it's the number of
		 * data blocks.  so, our rootblocks almost
		 * definitely underestimates the total size
		 * of the filesystem - how badly depends on the
		 * details of the filesystem type.  there isn't
		 * an obvious way to deal with this cleanly
		 * and perfectly, so for now we just pad our
		 * rootblocks estimate with an extra 5 percent.
		 */
		rootblocks += (rootblocks >> 5) +
		    (rootblocks >> 6) +
		    (rootblocks >> 7);
		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
		if (rootpages > size)
			panic("swap_on: miniroot larger than swap?");

		if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
			panic("swap_on: unable to preserve miniroot");
		}

		size -= rootpages;
		printf("Preserved %d pages of miniroot ", rootpages);
		printf("leaving %d pages of swap\n", size);
	}

	/*
	 * add a ref to vp to reflect usage as a swap device.
	 */
	vref(vp);

	/*
	 * now add the new swapdev to the drum and enable.
	 */
	result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
	if (result == 0)
		panic("swapdrum_add");
	/*
	 * If this is the first regular swap create the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (vp->v_type != VBLK) {
		if (sw_reg_count++ == 0) {
			KASSERT(sw_reg_workqueue == NULL);
			if (workqueue_create(&sw_reg_workqueue, "swapiod",
			    sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
				panic("%s: workqueue_create failed", __func__);
		}
	}

	sdp->swd_drumoffset = (int)result;
	sdp->swd_drumsize = npages;
	sdp->swd_npages = size;
	mutex_enter(&uvm_swap_data_lock);
	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
	uvmexp.swpages += size;
	uvmexp.swpgavail += size;
	mutex_exit(&uvm_swap_data_lock);
	return (0);

	/*
	 * failure: clean up and return error.
	 */

bad:
	if (sdp->swd_blist) {
		blist_destroy(sdp->swd_blist);
	}
	if (vp != rootvp) {
		(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
	}
	return (error);
}

/*
 * swap_off: stop swapping on swapdev
 *
 * => swap data should be locked, we will unlock.
 */
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
	int npages = sdp->swd_npages;
	int error = 0;

	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
	UVMHIST_LOG(pdhist, "  dev=%x, npages=%d", sdp->swd_dev,npages,0,0);

	/* disable the swap area being removed */
	sdp->swd_flags &= ~SWF_ENABLE;
	uvmexp.swpgavail -= npages;
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * the idea is to find all the pages that are paged out to this
	 * device, and page them all in.  in uvm, swap-backed pageable
	 * memory can take two forms: aobjs and anons.  call the
	 * swapoff hook for each subsystem to bring in pages.
	 */

	if (uao_swap_off(sdp->swd_drumoffset,
			 sdp->swd_drumoffset + sdp->swd_drumsize) ||
	    amap_swap_off(sdp->swd_drumoffset,
			  sdp->swd_drumoffset + sdp->swd_drumsize)) {
		error = ENOMEM;
	} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
		error = EBUSY;
	}

	if (error) {
		mutex_enter(&uvm_swap_data_lock);
		sdp->swd_flags |= SWF_ENABLE;
		uvmexp.swpgavail += npages;
		mutex_exit(&uvm_swap_data_lock);

		return error;
	}

	/*
	 * If this is the last regular swap destroy the workqueue.
	 * => Protected by swap_syscall_lock.
	 */
	if (sdp->swd_vp->v_type != VBLK) {
		KASSERT(sw_reg_count > 0);
		KASSERT(sw_reg_workqueue != NULL);
		if (--sw_reg_count == 0) {
			workqueue_destroy(sw_reg_workqueue);
			sw_reg_workqueue = NULL;
		}
	}

	/*
	 * done with the vnode.
	 * drop our ref on the vnode before calling VOP_CLOSE()
	 * so that spec_close() can tell if this is the last close.
	 */
	vrele(sdp->swd_vp);
	if (sdp->swd_vp != rootvp) {
		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
	}

	mutex_enter(&uvm_swap_data_lock);
	uvmexp.swpages -= npages;
	uvmexp.swpginuse -= sdp->swd_npgbad;

	if (swaplist_find(sdp->swd_vp, true) == NULL)
		panic("%s: swapdev not in list", __func__);
	swaplist_trim();
	mutex_exit(&uvm_swap_data_lock);

	/*
	 * free all resources!
	 */
	vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
	blist_destroy(sdp->swd_blist);
	bufq_free(sdp->swd_tab);
	free(sdp, M_VMSWAP);
	return (0);
}

/*
 * /dev/drum interface and i/o functions
 */

/*
 * swstrategy: perform I/O on the drum
 *
 * => we must map the i/o request from the drum to the correct swapdev.
 */
static void
swstrategy(struct buf *bp)
{
	struct swapdev *sdp;
	struct vnode *vp;
	int pageno, bn;
	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);

	/*
	 * convert block number to swapdev.   note that swapdev can't
	 * be yanked out from under us because we are holding resources
	 * in it (i.e. the blocks we are doing I/O on).
	 */
	pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(pageno);
	mutex_exit(&uvm_swap_data_lock);
	if (sdp == NULL) {
		bp->b_error = EINVAL;
		bp->b_resid = bp->b_bcount;
		biodone(bp);
		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
		return;
	}

	/*
	 * convert drum page number to block number on this swapdev.
	 */

	pageno -= sdp->swd_drumoffset;	/* page # on swapdev */
	bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
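	/*
	 * (worked example, illustrative: with 4096-byte pages and 512-byte
	 * disk blocks, a request at drum page 1000 against a swapdev whose
	 * swd_drumoffset is 200 gives pageno = 800 and
	 * bn = 800 * 4096 / 512 = 6400.)
	 */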

	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
	    sdp->swd_drumoffset, bn, bp->b_bcount);

	/*
	 * for block devices we finish up here.
	 * for regular files we have to do more work which we delegate
	 * to sw_reg_strategy().
	 */

	vp = sdp->swd_vp;		/* swapdev vnode pointer */
	switch (vp->v_type) {
	default:
		panic("%s: vnode type 0x%x", __func__, vp->v_type);

	case VBLK:

		/*
		 * must convert "bp" from an I/O on /dev/drum to an I/O
		 * on the swapdev (sdp).
		 */
		bp->b_blkno = bn;		/* swapdev block number */
		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */

		/*
		 * if we are doing a write, we have to redirect the i/o on
		 * drum's v_numoutput counter to the swapdevs.
		 */
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(bp->b_objlock);
			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
			mutex_exit(bp->b_objlock);
			mutex_enter(&vp->v_interlock);
			vp->v_numoutput++;	/* put it on swapdev */
			mutex_exit(&vp->v_interlock);
		}

		/*
		 * finally plug in swapdev vnode and start I/O
		 */
		bp->b_vp = vp;
		bp->b_objlock = &vp->v_interlock;
		VOP_STRATEGY(vp, bp);
		return;

	case VREG:
		/*
		 * delegate to sw_reg_strategy function.
		 */
		sw_reg_strategy(sdp, bp, bn);
		return;
	}
	/* NOTREACHED */
}

/*
 * swread: the read function for the drum (just a call to physio)
 */
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}

/*
 * swwrite: the write function for the drum (just a call to physio)
 */
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}

const struct bdevsw swap_bdevsw = {
	nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
};

const struct cdevsw swap_cdevsw = {
	nullopen, nullclose, swread, swwrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
};

/*
 * sw_reg_strategy: handle swap i/o to regular files
 */
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
	struct vnode	*vp;
	struct vndxfer	*vnx;
	daddr_t		nbn;
	char 		*addr;
	off_t		byteoff;
	int		s, off, nra, error, sz, resid;
	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);

	/*
	 * allocate a vndxfer head for this transfer and point it to
	 * our buffer.
	 */
	vnx = pool_get(&vndxfer_pool, PR_WAITOK);
	vnx->vx_flags = VX_BUSY;
	vnx->vx_error = 0;
	vnx->vx_pending = 0;
	vnx->vx_bp = bp;
	vnx->vx_sdp = sdp;

	/*
	 * setup for main loop where we read filesystem blocks into
	 * our buffer.
	 */
	error = 0;
	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
	addr = bp->b_data;		/* current position in buffer */
	byteoff = dbtob((uint64_t)bn);

	for (resid = bp->b_resid; resid; resid -= sz) {
		struct vndbuf	*nbp;

		/*
		 * translate byteoffset into block number.  return values:
		 *   vp = vnode of underlying device
		 *  nbn = new block number (on underlying vnode dev)
		 *  nra = num blocks we can read-ahead (excludes requested
		 *	block)
		 */
		nra = 0;
		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
		    &vp, &nbn, &nra);

		if (error == 0 && nbn == (daddr_t)-1) {
			/*
			 * this used to just set error, but that doesn't
			 * do the right thing.  Instead, it causes random
			 * memory errors.  The panic() should remain until
			 * this condition doesn't destabilize the system.
			 */
#if 1
			panic("%s: swap to sparse file", __func__);
#else
			error = EIO;	/* failure */
#endif
		}

		/*
		 * punt if there was an error or a hole in the file.
		 * we must wait for any i/o ops we have already started
		 * to finish before returning.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			s = splbio();
			vnx->vx_error = error;	/* pass error up */
			goto out;
		}

		/*
		 * compute the size ("sz") of this transfer (in bytes).
		 */
		off = byteoff % sdp->swd_bsize;
		sz = (1 + nra) * sdp->swd_bsize - off;
		if (sz > resid)
			sz = resid;

		UVMHIST_LOG(pdhist, "sw_reg_strategy: "
		    "vp %p/%p offset 0x%x/0x%x",
		    sdp->swd_vp, vp, byteoff, nbn);

		/*
		 * now get a buf structure.   note that the vb_buf is
		 * at the front of the nbp structure so that you can
		 * cast pointers between the two structures easily.
		 */
		nbp = pool_get(&vndbuf_pool, PR_WAITOK);
		buf_init(&nbp->vb_buf);
		nbp->vb_buf.b_flags    = bp->b_flags;
		nbp->vb_buf.b_cflags   = bp->b_cflags;
		nbp->vb_buf.b_oflags   = bp->b_oflags;
		nbp->vb_buf.b_bcount   = sz;
		nbp->vb_buf.b_bufsize  = sz;
		nbp->vb_buf.b_error    = 0;
		nbp->vb_buf.b_data     = addr;
		nbp->vb_buf.b_lblkno   = 0;
		nbp->vb_buf.b_blkno    = nbn + btodb(off);
		nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
		nbp->vb_buf.b_iodone   = sw_reg_biodone;
		nbp->vb_buf.b_vp       = vp;
		nbp->vb_buf.b_objlock  = &vp->v_interlock;
		if (vp->v_type == VBLK) {
			nbp->vb_buf.b_dev = vp->v_rdev;
		}

		nbp->vb_xfer = vnx;	/* patch it back in to vnx */

		/*
		 * Just sort by block number
		 */
		s = splbio();
		if (vnx->vx_error != 0) {
			buf_destroy(&nbp->vb_buf);
			pool_put(&vndbuf_pool, nbp);
			goto out;
		}
		vnx->vx_pending++;

		/* sort it in and start I/O if we are not over our limit */
		bufq_put(sdp->swd_tab, &nbp->vb_buf);
		sw_reg_start(sdp);
		splx(s);

		/*
		 * advance to the next I/O
		 */
		byteoff += sz;
		addr += sz;
	}

	s = splbio();

out: /* Arrive here at splbio */
	vnx->vx_flags &= ~VX_BUSY;
	if (vnx->vx_pending == 0) {
		error = vnx->vx_error;
		pool_put(&vndxfer_pool, vnx);
		bp->b_error = error;
		biodone(bp);
	}
	splx(s);
}

/*
 * sw_reg_start: start an I/O request on the requested swapdev
 *
 * => reqs are sorted by b_rawblkno (above)
 */
static void
sw_reg_start(struct swapdev *sdp)
{
	struct buf	*bp;
	struct vnode	*vp;
	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);

	/* recursion control */
	if ((sdp->swd_flags & SWF_BUSY) != 0)
		return;

	sdp->swd_flags |= SWF_BUSY;

	while (sdp->swd_active < sdp->swd_maxactive) {
		bp = bufq_get(sdp->swd_tab);
		if (bp == NULL)
			break;
		sdp->swd_active++;

		UVMHIST_LOG(pdhist,
		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
		vp = bp->b_vp;
		KASSERT(bp->b_objlock == &vp->v_interlock);
		if ((bp->b_flags & B_READ) == 0) {
			mutex_enter(&vp->v_interlock);
			vp->v_numoutput++;
			mutex_exit(&vp->v_interlock);
		}
		VOP_STRATEGY(vp, bp);
	}
	sdp->swd_flags &= ~SWF_BUSY;
}

/*
 * sw_reg_biodone: one of our i/o's has completed
 */
static void
sw_reg_biodone(struct buf *bp)
{
	workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}

/*
 * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
 *
 * => note that we can recover the vndbuf struct by casting the buf ptr
 */
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
	struct vndbuf *vbp = (void *)wk;
	struct vndxfer *vnx = vbp->vb_xfer;
	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
	struct swapdev	*sdp = vnx->vx_sdp;
	int s, resid, error;
	KASSERT(&vbp->vb_buf.b_work == wk);
	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);

	/*
	 * protect vbp at splbio and update.
	 */

	s = splbio();
	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
	pbp->b_resid -= resid;
	vnx->vx_pending--;

	if (vbp->vb_buf.b_error != 0) {
		/* pass error upward */
		error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
		UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
		vnx->vx_error = error;
	}

	/*
	 * kill vbp structure
	 */
	buf_destroy(&vbp->vb_buf);
	pool_put(&vndbuf_pool, vbp);

	/*
	 * wrap up this transaction if it has run to completion or, in
	 * case of an error, when all auxiliary buffers have returned.
	 */
	if (vnx->vx_error != 0) {
		/* pass error upward */
		error = vnx->vx_error;
		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
			pbp->b_error = error;
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	} else if (pbp->b_resid == 0) {
		KASSERT(vnx->vx_pending == 0);
		if ((vnx->vx_flags & VX_BUSY) == 0) {
			UVMHIST_LOG(pdhist, "  iodone error=%d !",
			    pbp, vnx->vx_error, 0, 0);
			biodone(pbp);
			pool_put(&vndxfer_pool, vnx);
		}
	}

	/*
	 * done!   start next swapdev I/O if one is pending
	 */
	sdp->swd_active--;
	sw_reg_start(sdp);
	splx(s);
}

/*
 * uvm_swap_alloc: allocate space on swap
 *
 * => allocation is done "round robin" down the priority list, as we
 *	allocate in a priority we "rotate" the circle queue.
 * => space can be freed with uvm_swap_free
 * => we return the page slot number in /dev/drum (0 == invalid slot)
 * => we lock uvm_swap_data_lock
 * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
 */
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
	struct swapdev *sdp;
	struct swappri *spp;
	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);

	/*
	 * no swap devices configured yet?   definite failure.
	 */
	if (uvmexp.nswapdev < 1)
		return 0;

	/*
	 * lock data lock, convert slots into blocks, and enter loop
	 */
	mutex_enter(&uvm_swap_data_lock);

ReTry:	/* XXXMRG */
	LIST_FOREACH(spp, &swap_priority, spi_swappri) {
		CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
			uint64_t result;

			/* if it's not enabled, then we can't swap from it */
			if ((sdp->swd_flags & SWF_ENABLE) == 0)
				continue;
			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
				continue;
			result = blist_alloc(sdp->swd_blist, *nslots);
			if (result == BLIST_NONE) {
				continue;
			}
			KASSERT(result < sdp->swd_drumsize);

			/*
			 * successful allocation!  now rotate the circleq.
			 */
			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
			sdp->swd_npginuse += *nslots;
			uvmexp.swpginuse += *nslots;
			mutex_exit(&uvm_swap_data_lock);
			/* done!  return drum slot number */
			UVMHIST_LOG(pdhist,
			    "success!  returning %d slots starting at %d",
			    *nslots, result + sdp->swd_drumoffset, 0, 0);
			return (result + sdp->swd_drumoffset);
		}
	}

	/* XXXMRG: BEGIN HACK */
	if (*nslots > 1 && lessok) {
		*nslots = 1;
		/* XXXMRG: ugh!  blist should support this for us */
		goto ReTry;
	}
	/* XXXMRG: END HACK */

	mutex_exit(&uvm_swap_data_lock);
	return 0;
}
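
/*
 * Minimal usage sketch (illustrative, from a context that may sleep):
 * allocate a run of slots, shrinking to a single slot via "lessok" if
 * necessary, and give them back with uvm_swap_free().
 */
#if 0
	int nslots = 4;
	int slot = uvm_swap_alloc(&nslots, true);	/* may reduce nslots */
	if (slot != 0) {
		/* ... do I/O with uvm_swap_put()/uvm_swap_get() ... */
		uvm_swap_free(slot, nslots);
	}
#endif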

/*
 * uvm_swapisfull: return true if most of available swap is allocated
 * and in use.  we don't count some small portion as it may be inaccessible
 * to us at any given moment, for example if there is lock contention or if
 * pages are busy.
 */
bool
uvm_swapisfull(void)
{
	int swpgonly;
	bool rv;

	mutex_enter(&uvm_swap_data_lock);
	KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
	swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
	    uvm_swapisfull_factor);
	rv = (swpgonly >= uvmexp.swpgavail);
	mutex_exit(&uvm_swap_data_lock);

	return (rv);
}
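
/*
 * Worked example (illustrative numbers): with the default factor of 99,
 * swpgonly = 990 and swpgavail = 1000 give 990 * 100 / 99 = 1000, so
 * uvm_swapisfull() reports true once 99% of the available swap is held
 * by swap-only pages.
 */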

/*
 * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
 *
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_markbad(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(sdp != NULL);

	/*
	 * we just keep track of how many pages have been marked bad
	 * in this device, to make everything add up in swap_off().
	 * we assume here that the range of slots will all be within
	 * one swap device.
	 */

	KASSERT(uvmexp.swpgonly >= nslots);
	uvmexp.swpgonly -= nslots;
	sdp->swd_npgbad += nslots;
	UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_free: free swap slots
 *
 * => this can be all or part of an allocation made by uvm_swap_alloc
 * => we lock uvm_swap_data_lock
 */
void
uvm_swap_free(int startslot, int nslots)
{
	struct swapdev *sdp;
	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
	    startslot, 0, 0);

	/*
	 * ignore attempts to free the "bad" slot.
	 */

	if (startslot == SWSLOT_BAD) {
		return;
	}

	/*
	 * convert drum slot offset back to sdp, free the blocks
	 * in the extent, and return.   must hold pri lock to do
	 * lookup and access the extent.
	 */

	mutex_enter(&uvm_swap_data_lock);
	sdp = swapdrum_getsdp(startslot);
	KASSERT(uvmexp.nswapdev >= 1);
	KASSERT(sdp != NULL);
	KASSERT(sdp->swd_npginuse >= nslots);
	blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
	sdp->swd_npginuse -= nslots;
	uvmexp.swpginuse -= nslots;
	mutex_exit(&uvm_swap_data_lock);
}

/*
 * uvm_swap_put: put any number of pages into a contig place on swap
 *
 * => can be sync or async
 */
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
	int error;

	error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	return error;
}

/*
 * uvm_swap_get: get a single page from swap
 *
 * => usually a sync op (from fault)
 */
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
	int error;

	uvmexp.nswget++;
	KASSERT(flags & PGO_SYNCIO);
	if (swslot == SWSLOT_BAD) {
		return EIO;
	}

	error = uvm_swap_io(&page, swslot, 1, B_READ |
	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
	if (error == 0) {

		/*
		 * this page is no longer only in swap.
		 */

		mutex_enter(&uvm_swap_data_lock);
		KASSERT(uvmexp.swpgonly > 0);
		uvmexp.swpgonly--;
		mutex_exit(&uvm_swap_data_lock);
	}
	return error;
}

/*
 * uvm_swap_io: do an i/o operation to swap
 */

static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
	daddr_t startblk;
	struct	buf *bp;
	vaddr_t kva;
	int	error, mapinflags;
	bool write, async;
	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
	    startslot, npages, flags, 0);

	write = (flags & B_READ) == 0;
	async = (flags & B_ASYNC) != 0;

	/*
	 * allocate a buf for the i/o.
	 */

	KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
	bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
	if (bp == NULL) {
		uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
		return ENOMEM;
	}

	/*
	 * convert starting drum slot to block number
	 */

	startblk = btodb((uint64_t)startslot << PAGE_SHIFT);

	/*
	 * first, map the pages into the kernel.
	 */

	mapinflags = !write ?
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
		UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
	kva = uvm_pagermapin(pps, npages, mapinflags);

	/*
	 * fill in the bp/sbp.   we currently route our i/o through
	 * /dev/drum's vnode [swapdev_vp].
	 */

	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_flags = (flags & (B_READ|B_ASYNC));
	bp->b_proc = &proc0;	/* XXX */
	bp->b_vnbufs.le_next = NOLIST;
	bp->b_data = (void *)kva;
	bp->b_blkno = startblk;
	bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;

	/*
	 * bump v_numoutput (counter of number of active outputs).
	 */

	if (write) {
		mutex_enter(&swapdev_vp->v_interlock);
		swapdev_vp->v_numoutput++;
		mutex_exit(&swapdev_vp->v_interlock);
	}

	/*
	 * for async ops we must set up the iodone handler.
	 */

	if (async) {
		bp->b_iodone = uvm_aio_biodone;
		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
		if (curlwp == uvm.pagedaemon_lwp)
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		else
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
	} else {
		bp->b_iodone = NULL;
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	}
	UVMHIST_LOG(pdhist,
	    "about to start io: data = %p blkno = 0x%x, bcount = %ld",
	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);

	/*
	 * now we start the I/O, and if async, return.
	 */

	VOP_STRATEGY(swapdev_vp, bp);
	if (async)
		return 0;

	/*
	 * must be sync i/o.   wait for it to finish
	 */

	error = biowait(bp);

	/*
	 * kill the pager mapping
	 */

	uvm_pagermapout(kva, npages);

	/*
	 * now dispose of the buf and we're done.
	 */

	if (write) {
		mutex_enter(&swapdev_vp->v_interlock);
		vwakeup(bp);
		mutex_exit(&swapdev_vp->v_interlock);
	}
	putiobuf(bp);
	UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);

	return (error);
}