sys/uvm/uvm_swap.c

   1 /*      $NetBSD: uvm_swap.c,v 1.146 2009/09/13 18:45:12 pooka Exp $     */
   2
   3 /*
   4  * Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  *
  28  * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
  29  * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.146 2009/09/13 18:45:12 pooka Exp $");
  34
  35 #include "fs_nfs.h"
  36 #include "opt_uvmhist.h"
  37 #include "opt_compat_netbsd.h"
  38 #include "opt_ddb.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/buf.h>
  43 #include <sys/bufq.h>
  44 #include <sys/conf.h>
  45 #include <sys/proc.h>
  46 #include <sys/namei.h>
  47 #include <sys/disklabel.h>
  48 #include <sys/errno.h>
  49 #include <sys/kernel.h>
  50 #include <sys/malloc.h>
  51 #include <sys/vnode.h>
  52 #include <sys/file.h>
  53 #include <sys/vmem.h>
  54 #include <sys/blist.h>
  55 #include <sys/mount.h>
  56 #include <sys/pool.h>
  57 #include <sys/syscallargs.h>
  58 #include <sys/swap.h>
  59 #include <sys/kauth.h>
  60 #include <sys/sysctl.h>
  61 #include <sys/workqueue.h>
  62
  63 #include <uvm/uvm.h>
  64
  65 #include <miscfs/specfs/specdev.h>
  66
  67 /*
  68  * uvm_swap.c: manage configuration and i/o to swap space.
  69  */
  70
  71 /*
  72  * swap space is managed in the following way:
  73  *
  74  * each swap partition or file is described by a "swapdev" structure.
  75  * each "swapdev" structure contains a "swapent" structure which contains
  76  * information that is passed up to the user (via system calls).
  77  *
  78  * each swap partition is assigned a "priority" (int) which controls
   79  * swap partition usage.
  80  *
  81  * the system maintains a global data structure describing all swap
  82  * partitions/files.   there is a sorted LIST of "swappri" structures
  83  * which describe "swapdev"'s at that priority.   this LIST is headed
  84  * by the "swap_priority" global var.    each "swappri" contains a
  85  * CIRCLEQ of "swapdev" structures at that priority.
  86  *
  87  * locking:
  88  *  - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
  89  *    system call and prevents the swap priority list from changing
  90  *    while we are in the middle of a system call (e.g. SWAP_STATS).
  91  *  - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
  92  *    structures including the priority list, the swapdev structures,
  93  *    and the swapmap arena.
  94  *
  95  * each swap device has the following info:
  96  *  - swap device in use (could be disabled, preventing future use)
  97  *  - swap enabled (allows new allocations on swap)
  98  *  - map info in /dev/drum
  99  *  - vnode pointer
 100  * for swap files only:
 101  *  - block size
 102  *  - max byte count in buffer
 103  *  - buffer
 104  *
 105  * userland controls and configures swap with the swapctl(2) system call.
 106  * the sys_swapctl performs the following operations:
 107  *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
 108  *  [2] SWAP_STATS: given a pointer to an array of swapent structures
 109  *      (passed in via "arg") of a size passed in via "misc" ... we load
 110  *      the current swap config into the array. The actual work is done
 111  *      in the uvm_swap_stats(9) function.
 112  *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
 113  *      priority in "misc", start swapping on it.
 114  *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
 115  *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
 116  *      "misc")
 117  */
 118
 119 /*
 120  * swapdev: describes a single swap partition/file
 121  *
 122  * note the following should be true:
 123  * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
 124  * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
 125  */
 126 struct swapdev {
 127         dev_t                   swd_dev;        /* device id */
 128         int                     swd_flags;      /* flags:inuse/enable/fake */
 129         int                     swd_priority;   /* our priority */
 130         int                     swd_nblks;      /* blocks in this device */
 131         char                    *swd_path;      /* saved pathname of device */
 132         int                     swd_pathlen;    /* length of pathname */
 133         int                     swd_npages;     /* #pages we can use */
 134         int                     swd_npginuse;   /* #pages in use */
 135         int                     swd_npgbad;     /* #pages bad */
 136         int                     swd_drumoffset; /* page0 offset in drum */
 137         int                     swd_drumsize;   /* #pages in drum */
 138         blist_t                 swd_blist;      /* blist for this swapdev */
 139         struct vnode            *swd_vp;        /* backing vnode */
 140         CIRCLEQ_ENTRY(swapdev)  swd_next;       /* priority circleq */
 141
 142         int                     swd_bsize;      /* blocksize (bytes) */
 143         int                     swd_maxactive;  /* max active i/o reqs */
 144         struct bufq_state       *swd_tab;       /* buffer list */
 145         int                     swd_active;     /* number of active buffers */
 146 };
 147
 148 /*
 149  * swap device priority entry; the list is kept sorted on `spi_priority'.
 150  */
 151 struct swappri {
 152         int                     spi_priority;     /* priority */
 153         CIRCLEQ_HEAD(spi_swapdev, swapdev)      spi_swapdev;
 154         /* circleq of swapdevs at this priority */
 155         LIST_ENTRY(swappri)     spi_swappri;      /* global list of pri's */
 156 };
 157
 158 /*
 159  * The following two structures are used to keep track of data transfers
 160  * on swap devices associated with regular files.
 161  * NOTE: this code is more or less a copy of vnd.c; we use the same
 162  * structure names here to ease porting..
 163  */
 164 struct vndxfer {
 165         struct buf      *vx_bp;         /* Pointer to parent buffer */
 166         struct swapdev  *vx_sdp;
 167         int             vx_error;
 168         int             vx_pending;     /* # of pending aux buffers */
 169         int             vx_flags;
 170 #define VX_BUSY         1
 171 #define VX_DEAD         2
 172 };
 173
 174 struct vndbuf {
 175         struct buf      vb_buf;
 176         struct vndxfer  *vb_xfer;
 177 };
 178
 179 /*
 180  * NetBSD 1.3 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 181  * dev_t and has no se_path[] member.
 182  */
 183 struct swapent13 {
 184         int32_t se13_dev;               /* device id */
 185         int     se13_flags;             /* flags */
 186         int     se13_nblks;             /* total blocks */
 187         int     se13_inuse;             /* blocks in use */
 188         int     se13_priority;          /* priority of this device */
 189 };
 190
 191 /*
 192  * NetBSD 5.0 swapctl(SWAP_STATS, ...) swapent structure; uses 32 bit
 193  * dev_t.
 194  */
 195 struct swapent50 {
 196         int32_t se50_dev;               /* device id */
 197         int     se50_flags;             /* flags */
 198         int     se50_nblks;             /* total blocks */
 199         int     se50_inuse;             /* blocks in use */
 200         int     se50_priority;          /* priority of this device */
 201         char    se50_path[PATH_MAX+1];  /* path name */
 202 };
 203
 204 /*
 205  * We keep a of pool vndbuf's and vndxfer structures.
 206  */
 207 static struct pool vndxfer_pool, vndbuf_pool;
 208
 209 /*
 210  * local variables
 211  */
 212 MALLOC_DEFINE(M_VMSWAP, "VM swap", "VM swap structures");
 213 static vmem_t *swapmap; /* controls the mapping of /dev/drum */
 214
 215 /* list of all active swap devices [by priority] */
 216 LIST_HEAD(swap_priority, swappri);
 217 static struct swap_priority swap_priority;
 218
 219 /* locks */
 220 static krwlock_t swap_syscall_lock;
 221
 222 /* workqueue and use counter for swap to regular files */
 223 static int sw_reg_count = 0;
 224 static struct workqueue *sw_reg_workqueue;
 225
 226 /* tuneables */
 227 u_int uvm_swapisfull_factor = 99;
 228
 229 /*
 230  * prototypes
 231  */
 232 static struct swapdev   *swapdrum_getsdp(int);
 233
 234 static struct swapdev   *swaplist_find(struct vnode *, bool);
 235 static void              swaplist_insert(struct swapdev *,
 236                                          struct swappri *, int);
 237 static void              swaplist_trim(void);
 238
 239 static int swap_on(struct lwp *, struct swapdev *);
 240 static int swap_off(struct lwp *, struct swapdev *);
 241
 242 static void uvm_swap_stats_locked(int, struct swapent *, int, register_t *);
 243
 244 static void sw_reg_strategy(struct swapdev *, struct buf *, int);
 245 static void sw_reg_biodone(struct buf *);
 246 static void sw_reg_iodone(struct work *wk, void *dummy);
 247 static void sw_reg_start(struct swapdev *);
 248
 249 static int uvm_swap_io(struct vm_page **, int, int, int);
 250
 251 /*
 252  * uvm_swap_init: init the swap system data structures and locks
 253  *
 254  * => called at boot time from init_main.c after the filesystems
 255  *      are brought up (which happens after uvm_init())
 256  */
 257 void
 258 uvm_swap_init(void)
 259 {
 260         UVMHIST_FUNC("uvm_swap_init");
 261
 262         UVMHIST_CALLED(pdhist);
 263         /*
 264          * first, init the swap list, its counter, and its lock.
 265          * then get a handle on the vnode for /dev/drum by using
 266          * the its dev_t number ("swapdev", from MD conf.c).
 267          */
 268
 269         LIST_INIT(&swap_priority);
 270         uvmexp.nswapdev = 0;
 271         rw_init(&swap_syscall_lock);
 272         mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
 273
 274         if (bdevvp(swapdev, &swapdev_vp))
 275                 panic("%s: can't get vnode for swap device", __func__);
 276         if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
 277                 panic("%s: can't lock swap device", __func__);
 278         if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
 279                 panic("%s: can't open swap device", __func__);
 280         VOP_UNLOCK(swapdev_vp, 0);
 281
 282         /*
 283          * create swap block resource map to map /dev/drum.   the range
 284          * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
 285          * that block 0 is reserved (used to indicate an allocation
 286          * failure, or no allocation).
 287          */
 288         swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
 289             VM_NOSLEEP, IPL_NONE);
 290         if (swapmap == 0) {
 291                 panic("%s: vmem_create failed", __func__);
 292         }
 293
 294         pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
 295             NULL, IPL_BIO);
 296         pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
 297             NULL, IPL_BIO);
 298
 299         UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
 300 }
 301
 302 /*
 303  * swaplist functions: functions that operate on the list of swap
 304  * devices on the system.
 305  */
 306
 307 /*
 308  * swaplist_insert: insert swap device "sdp" into the global list
 309  *
 310  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 311  * => caller must provide a newly malloc'd swappri structure (we will
 312  *      FREE it if we don't need it... this it to prevent malloc blocking
 313  *      here while adding swap)
 314  */
 315 static void
 316 swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
 317 {
 318         struct swappri *spp, *pspp;
 319         UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
 320
 321         /*
 322          * find entry at or after which to insert the new device.
 323          */
 324         pspp = NULL;
 325         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
 326                 if (priority <= spp->spi_priority)
 327                         break;
 328                 pspp = spp;
 329         }
 330
 331         /*
 332          * new priority?
 333          */
 334         if (spp == NULL || spp->spi_priority != priority) {
 335                 spp = newspp;  /* use newspp! */
 336                 UVMHIST_LOG(pdhist, "created new swappri = %d",
 337                             priority, 0, 0, 0);
 338
 339                 spp->spi_priority = priority;
 340                 CIRCLEQ_INIT(&spp->spi_swapdev);
 341
 342                 if (pspp)
 343                         LIST_INSERT_AFTER(pspp, spp, spi_swappri);
 344                 else
 345                         LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
 346         } else {
 347                 /* we don't need a new priority structure, free it */
 348                 free(newspp, M_VMSWAP);
 349         }
 350
 351         /*
 352          * priority found (or created).   now insert on the priority's
 353          * circleq list and bump the total number of swapdevs.
 354          */
 355         sdp->swd_priority = priority;
 356         CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
 357         uvmexp.nswapdev++;
 358 }
 359
 360 /*
 361  * swaplist_find: find and optionally remove a swap device from the
 362  *      global list.
 363  *
 364  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 365  * => we return the swapdev we found (and removed)
 366  */
 367 static struct swapdev *
 368 swaplist_find(struct vnode *vp, bool remove)
 369 {
 370         struct swapdev *sdp;
 371         struct swappri *spp;
 372
 373         /*
 374          * search the lists for the requested vp
 375          */
 376
 377         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
 378                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
 379                         if (sdp->swd_vp == vp) {
 380                                 if (remove) {
 381                                         CIRCLEQ_REMOVE(&spp->spi_swapdev,
 382                                             sdp, swd_next);
 383                                         uvmexp.nswapdev--;
 384                                 }
 385                                 return(sdp);
 386                         }
 387                 }
 388         }
 389         return (NULL);
 390 }
 391
 392 /*
 393  * swaplist_trim: scan priority list for empty priority entries and kill
 394  *      them.
 395  *
 396  * => caller must hold both swap_syscall_lock and uvm_swap_data_lock
 397  */
 398 static void
 399 swaplist_trim(void)
 400 {
 401         struct swappri *spp, *nextspp;
 402
 403         for (spp = LIST_FIRST(&swap_priority); spp != NULL; spp = nextspp) {
 404                 nextspp = LIST_NEXT(spp, spi_swappri);
 405                 if (CIRCLEQ_FIRST(&spp->spi_swapdev) !=
 406                     (void *)&spp->spi_swapdev)
 407                         continue;
 408                 LIST_REMOVE(spp, spi_swappri);
 409                 free(spp, M_VMSWAP);
 410         }
 411 }
 412
 413 /*
 414  * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
 415  *      to the "swapdev" that maps that section of the drum.
 416  *
 417  * => each swapdev takes one big contig chunk of the drum
 418  * => caller must hold uvm_swap_data_lock
 419  */
 420 static struct swapdev *
 421 swapdrum_getsdp(int pgno)
 422 {
 423         struct swapdev *sdp;
 424         struct swappri *spp;
 425
 426         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
 427                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
 428                         if (sdp->swd_flags & SWF_FAKE)
 429                                 continue;
 430                         if (pgno >= sdp->swd_drumoffset &&
 431                             pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
 432                                 return sdp;
 433                         }
 434                 }
 435         }
 436         return NULL;
 437 }
 438
 439
 440 /*
 441  * sys_swapctl: main entry point for swapctl(2) system call
 442  *      [with two helper functions: swap_on and swap_off]
 443  */
 444 int
 445 sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
 446 {
 447         /* {
 448                 syscallarg(int) cmd;
 449                 syscallarg(void *) arg;
 450                 syscallarg(int) misc;
 451         } */
 452         struct vnode *vp;
 453         struct nameidata nd;
 454         struct swappri *spp;
 455         struct swapdev *sdp;
 456         struct swapent *sep;
 457 #define SWAP_PATH_MAX (PATH_MAX + 1)
 458         char    *userpath;
 459         size_t  len;
 460         int     error, misc;
 461         int     priority;
 462         UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
 463
 464         misc = SCARG(uap, misc);
 465
 466         /*
 467          * ensure serialized syscall access by grabbing the swap_syscall_lock
 468          */
 469         rw_enter(&swap_syscall_lock, RW_WRITER);
 470
 471         userpath = malloc(SWAP_PATH_MAX, M_TEMP, M_WAITOK);
 472         /*
 473          * we handle the non-priv NSWAP and STATS request first.
 474          *
 475          * SWAP_NSWAP: return number of config'd swap devices
 476          * [can also be obtained with uvmexp sysctl]
 477          */
 478         if (SCARG(uap, cmd) == SWAP_NSWAP) {
 479                 UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
 480                     0, 0, 0);
 481                 *retval = uvmexp.nswapdev;
 482                 error = 0;
 483                 goto out;
 484         }
 485
 486         /*
 487          * SWAP_STATS: get stats on current # of configured swap devs
 488          *
 489          * note that the swap_priority list can't change as long
 490          * as we are holding the swap_syscall_lock.  we don't want
 491          * to grab the uvm_swap_data_lock because we may fault&sleep during
 492          * copyout() and we don't want to be holding that lock then!
 493          */
 494         if (SCARG(uap, cmd) == SWAP_STATS
 495 #if defined(COMPAT_50)
 496             || SCARG(uap, cmd) == SWAP_STATS50
 497 #endif
 498 #if defined(COMPAT_13)
 499             || SCARG(uap, cmd) == SWAP_STATS13
 500 #endif
 501             ) {
 502                 if ((size_t)misc > (size_t)uvmexp.nswapdev)
 503                         misc = uvmexp.nswapdev;
 504 #if defined(COMPAT_13)
 505                 if (SCARG(uap, cmd) == SWAP_STATS13)
 506                         len = sizeof(struct swapent13) * misc;
 507                 else
 508 #endif
 509 #if defined(COMPAT_50)
 510                 if (SCARG(uap, cmd) == SWAP_STATS50)
 511                         len = sizeof(struct swapent50) * misc;
 512                 else
 513 #endif
 514                         len = sizeof(struct swapent) * misc;
 515                 sep = (struct swapent *)malloc(len, M_TEMP, M_WAITOK);
 516
 517                 uvm_swap_stats_locked(SCARG(uap, cmd), sep, misc, retval);
 518                 error = copyout(sep, SCARG(uap, arg), len);
 519
 520                 free(sep, M_TEMP);
 521                 UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
 522                 goto out;
 523         }
 524         if (SCARG(uap, cmd) == SWAP_GETDUMPDEV) {
 525                 dev_t   *devp = (dev_t *)SCARG(uap, arg);
 526
 527                 error = copyout(&dumpdev, devp, sizeof(dumpdev));
 528                 goto out;
 529         }
 530
 531         /*
 532          * all other requests require superuser privs.   verify.
 533          */
 534         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
 535             0, NULL, NULL, NULL)))
 536                 goto out;
 537
 538         if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
 539                 /* drop the current dump device */
 540                 dumpdev = NODEV;
 541                 dumpcdev = NODEV;
 542                 cpu_dumpconf();
 543                 goto out;
 544         }
 545
 546         /*
 547          * at this point we expect a path name in arg.   we will
 548          * use namei() to gain a vnode reference (vref), and lock
 549          * the vnode (VOP_LOCK).
 550          *
 551          * XXX: a NULL arg means use the root vnode pointer (e.g. for
 552          * miniroot)
 553          */
 554         if (SCARG(uap, arg) == NULL) {
 555                 vp = rootvp;            /* miniroot */
 556                 if (vget(vp, LK_EXCLUSIVE)) {
 557                         error = EBUSY;
 558                         goto out;
 559                 }
 560                 if (SCARG(uap, cmd) == SWAP_ON &&
 561                     copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
 562                         panic("swapctl: miniroot copy failed");
 563         } else {
 564                 int     space;
 565                 char    *where;
 566
 567                 if (SCARG(uap, cmd) == SWAP_ON) {
 568                         if ((error = copyinstr(SCARG(uap, arg), userpath,
 569                             SWAP_PATH_MAX, &len)))
 570                                 goto out;
 571                         space = UIO_SYSSPACE;
 572                         where = userpath;
 573                 } else {
 574                         space = UIO_USERSPACE;
 575                         where = (char *)SCARG(uap, arg);
 576                 }
 577                 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT,
 578                     space, where);
 579                 if ((error = namei(&nd)))
 580                         goto out;
 581                 vp = nd.ni_vp;
 582         }
 583         /* note: "vp" is referenced and locked */
 584
 585         error = 0;              /* assume no error */
 586         switch(SCARG(uap, cmd)) {
 587
 588         case SWAP_DUMPDEV:
 589                 if (vp->v_type != VBLK) {
 590                         error = ENOTBLK;
 591                         break;
 592                 }
 593                 if (bdevsw_lookup(vp->v_rdev)) {
 594                         dumpdev = vp->v_rdev;
 595                         dumpcdev = devsw_blk2chr(dumpdev);
 596                 } else
 597                         dumpdev = NODEV;
 598                 cpu_dumpconf();
 599                 break;
 600
 601         case SWAP_CTL:
 602                 /*
 603                  * get new priority, remove old entry (if any) and then
 604                  * reinsert it in the correct place.  finally, prune out
 605                  * any empty priority structures.
 606                  */
 607                 priority = SCARG(uap, misc);
 608                 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
 609                 mutex_enter(&uvm_swap_data_lock);
 610                 if ((sdp = swaplist_find(vp, true)) == NULL) {
 611                         error = ENOENT;
 612                 } else {
 613                         swaplist_insert(sdp, spp, priority);
 614                         swaplist_trim();
 615                 }
 616                 mutex_exit(&uvm_swap_data_lock);
 617                 if (error)
 618                         free(spp, M_VMSWAP);
 619                 break;
 620
 621         case SWAP_ON:
 622
 623                 /*
 624                  * check for duplicates.   if none found, then insert a
 625                  * dummy entry on the list to prevent someone else from
 626                  * trying to enable this device while we are working on
 627                  * it.
 628                  */
 629
 630                 priority = SCARG(uap, misc);
 631                 sdp = malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
 632                 spp = malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
 633                 memset(sdp, 0, sizeof(*sdp));
 634                 sdp->swd_flags = SWF_FAKE;
 635                 sdp->swd_vp = vp;
 636                 sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
 637                 bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
 638                 mutex_enter(&uvm_swap_data_lock);
 639                 if (swaplist_find(vp, false) != NULL) {
 640                         error = EBUSY;
 641                         mutex_exit(&uvm_swap_data_lock);
 642                         bufq_free(sdp->swd_tab);
 643                         free(sdp, M_VMSWAP);
 644                         free(spp, M_VMSWAP);
 645                         break;
 646                 }
 647                 swaplist_insert(sdp, spp, priority);
 648                 mutex_exit(&uvm_swap_data_lock);
 649
 650                 sdp->swd_pathlen = len;
 651                 sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
 652                 if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
 653                         panic("swapctl: copystr");
 654
 655                 /*
 656                  * we've now got a FAKE placeholder in the swap list.
 657                  * now attempt to enable swap on it.  if we fail, undo
 658                  * what we've done and kill the fake entry we just inserted.
 659                  * if swap_on is a success, it will clear the SWF_FAKE flag
 660                  */
 661
 662                 if ((error = swap_on(l, sdp)) != 0) {
 663                         mutex_enter(&uvm_swap_data_lock);
 664                         (void) swaplist_find(vp, true);  /* kill fake entry */
 665                         swaplist_trim();
 666                         mutex_exit(&uvm_swap_data_lock);
 667                         bufq_free(sdp->swd_tab);
 668                         free(sdp->swd_path, M_VMSWAP);
 669                         free(sdp, M_VMSWAP);
 670                         break;
 671                 }
 672                 break;
 673
 674         case SWAP_OFF:
 675                 mutex_enter(&uvm_swap_data_lock);
 676                 if ((sdp = swaplist_find(vp, false)) == NULL) {
 677                         mutex_exit(&uvm_swap_data_lock);
 678                         error = ENXIO;
 679                         break;
 680                 }
 681
 682                 /*
 683                  * If a device isn't in use or enabled, we
 684                  * can't stop swapping from it (again).
 685                  */
 686                 if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
 687                         mutex_exit(&uvm_swap_data_lock);
 688                         error = EBUSY;
 689                         break;
 690                 }
 691
 692                 /*
 693                  * do the real work.
 694                  */
 695                 error = swap_off(l, sdp);
 696                 break;
 697
 698         default:
 699                 error = EINVAL;
 700         }
 701
 702         /*
 703          * done!  release the ref gained by namei() and unlock.
 704          */
 705         vput(vp);
 706
 707 out:
 708         free(userpath, M_TEMP);
 709         rw_exit(&swap_syscall_lock);
 710
 711         UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
 712         return (error);
 713 }
 714
 715 /*
 716  * swap_stats: implements swapctl(SWAP_STATS). The function is kept
 717  * away from sys_swapctl() in order to allow COMPAT_* swapctl()
 718  * emulation to use it directly without going through sys_swapctl().
 719  * The problem with using sys_swapctl() there is that it involves
 720  * copying the swapent array to the stackgap, and this array's size
 721  * is not known at build time. Hence it would not be possible to
 722  * ensure it would fit in the stackgap in any case.
 723  */
 724 void
 725 uvm_swap_stats(int cmd, struct swapent *sep, int sec, register_t *retval)
 726 {
 727
 728         rw_enter(&swap_syscall_lock, RW_READER);
 729         uvm_swap_stats_locked(cmd, sep, sec, retval);
 730         rw_exit(&swap_syscall_lock);
 731 }
 732
 733 static void
 734 uvm_swap_stats_locked(int cmd, struct swapent *sep, int sec, register_t *retval)
 735 {
 736         struct swappri *spp;
 737         struct swapdev *sdp;
 738         int count = 0;
 739
 740         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
 741                 for (sdp = CIRCLEQ_FIRST(&spp->spi_swapdev);
 742                      sdp != (void *)&spp->spi_swapdev && sec-- > 0;
 743                      sdp = CIRCLEQ_NEXT(sdp, swd_next)) {
 744                         int inuse;
 745
 746                         /*
 747                          * backwards compatibility for system call.
 748                          * For NetBSD 1.3 and 5.0, we have to use
 749                          * the 32 bit dev_t.  For 5.0 and -current
 750                          * we have to add the path.
 751                          */
 752                         inuse = btodb((uint64_t)sdp->swd_npginuse <<
 753                             PAGE_SHIFT);
 754
 755 #if defined(COMPAT_13) || defined(COMPAT_50)
 756                         if (cmd == SWAP_STATS) {
 757 #endif
 758                                 sep->se_dev = sdp->swd_dev;
 759                                 sep->se_flags = sdp->swd_flags;
 760                                 sep->se_nblks = sdp->swd_nblks;
 761                                 sep->se_inuse = inuse;
 762                                 sep->se_priority = sdp->swd_priority;
 763                                 memcpy(&sep->se_path, sdp->swd_path,
 764                                        sizeof sep->se_path);
 765                                 sep++;
 766 #if defined(COMPAT_13)
 767                         } else if (cmd == SWAP_STATS13) {
 768                                 struct swapent13 *sep13 =
 769                                     (struct swapent13 *)sep;
 770
 771                                 sep13->se13_dev = sdp->swd_dev;
 772                                 sep13->se13_flags = sdp->swd_flags;
 773                                 sep13->se13_nblks = sdp->swd_nblks;
 774                                 sep13->se13_inuse = inuse;
 775                                 sep13->se13_priority = sdp->swd_priority;
 776                                 sep = (struct swapent *)(sep13 + 1);
 777 #endif
 778 #if defined(COMPAT_50)
 779                         } else if (cmd == SWAP_STATS50) {
 780                                 struct swapent50 *sep50 =
 781                                     (struct swapent50 *)sep;
 782
 783                                 sep50->se50_dev = sdp->swd_dev;
 784                                 sep50->se50_flags = sdp->swd_flags;
 785                                 sep50->se50_nblks = sdp->swd_nblks;
 786                                 sep50->se50_inuse = inuse;
 787                                 sep50->se50_priority = sdp->swd_priority;
 788                                 memcpy(&sep50->se50_path, sdp->swd_path,
 789                                        sizeof sep50->se50_path);
 790                                 sep = (struct swapent *)(sep50 + 1);
 791                         }
 792 #endif
 793                         count++;
 794                 }
 795         }
 796
 797         *retval = count;
 798         return;
 799 }
 800
 801 /*
 802  * swap_on: attempt to enable a swapdev for swapping.   note that the
 803  *      swapdev is already on the global list, but disabled (marked
 804  *      SWF_FAKE).
 805  *
 806  * => we avoid the start of the disk (to protect disk labels)
 807  * => we also avoid the miniroot, if we are swapping to root.
 808  * => caller should leave uvm_swap_data_lock unlocked, we may lock it
 809  *      if needed.
 810  */
 811 static int
 812 swap_on(struct lwp *l, struct swapdev *sdp)
 813 {
 814         struct vnode *vp;
 815         int error, npages, nblocks, size;
 816         long addr;
 817         u_long result;
 818         struct vattr va;
 819 #ifdef NFS
 820         extern int (**nfsv2_vnodeop_p)(void *);
 821 #endif /* NFS */
 822         const struct bdevsw *bdev;
 823         dev_t dev;
 824         UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
 825
 826         /*
 827          * we want to enable swapping on sdp.   the swd_vp contains
 828          * the vnode we want (locked and ref'd), and the swd_dev
 829          * contains the dev_t of the file, if it a block device.
 830          */
 831
 832         vp = sdp->swd_vp;
 833         dev = sdp->swd_dev;
 834
 835         /*
 836          * open the swap file (mostly useful for block device files to
 837          * let device driver know what is up).
 838          *
 839          * we skip the open/close for root on swap because the root
 840          * has already been opened when root was mounted (mountroot).
 841          */
 842         if (vp != rootvp) {
 843                 if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
 844                         return (error);
 845         }
 846
 847         /* XXX this only works for block devices */
 848         UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);
 849
 850         /*
 851          * we now need to determine the size of the swap area.   for
 852          * block specials we can call the d_psize function.
 853          * for normal files, we must stat [get attrs].
 854          *
 855          * we put the result in nblks.
 856          * for normal files, we also want the filesystem block size
 857          * (which we get with statfs).
 858          */
 859         switch (vp->v_type) {
 860         case VBLK:
 861                 bdev = bdevsw_lookup(dev);
 862                 if (bdev == NULL || bdev->d_psize == NULL ||
 863                     (nblocks = (*bdev->d_psize)(dev)) == -1) {
 864                         error = ENXIO;
 865                         goto bad;
 866                 }
 867                 break;
 868
 869         case VREG:
 870                 if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
 871                         goto bad;
 872                 nblocks = (int)btodb(va.va_size);
 873                 if ((error =
 874                      VFS_STATVFS(vp->v_mount, &vp->v_mount->mnt_stat)) != 0)
 875                         goto bad;
 876
 877                 sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
 878                 /*
 879                  * limit the max # of outstanding I/O requests we issue
 880                  * at any one time.   take it easy on NFS servers.
 881                  */
 882 #ifdef NFS
 883                 if (vp->v_op == nfsv2_vnodeop_p)
 884                         sdp->swd_maxactive = 2; /* XXX */
 885                 else
 886 #endif /* NFS */
 887                         sdp->swd_maxactive = 8; /* XXX */
 888                 break;
 889
 890         default:
 891                 error = ENXIO;
 892                 goto bad;
 893         }
 894
 895         /*
 896          * save nblocks in a safe place and convert to pages.
 897          */
 898
 899         sdp->swd_nblks = nblocks;
 900         npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
 901
 902         /*
 903          * for block special files, we want to make sure that leave
 904          * the disklabel and bootblocks alone, so we arrange to skip
 905          * over them (arbitrarily choosing to skip PAGE_SIZE bytes).
 906          * note that because of this the "size" can be less than the
 907          * actual number of blocks on the device.
 908          */
 909         if (vp->v_type == VBLK) {
 910                 /* we use pages 1 to (size - 1) [inclusive] */
 911                 size = npages - 1;
 912                 addr = 1;
 913         } else {
 914                 /* we use pages 0 to (size - 1) [inclusive] */
 915                 size = npages;
 916                 addr = 0;
 917         }
 918
 919         /*
 920          * make sure we have enough blocks for a reasonable sized swap
 921          * area.   we want at least one page.
 922          */
 923
 924         if (size < 1) {
 925                 UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
 926                 error = EINVAL;
 927                 goto bad;
 928         }
 929
 930         UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
 931
 932         /*
 933          * now we need to allocate an extent to manage this swap device
 934          */
 935
 936         sdp->swd_blist = blist_create(npages);
 937         /* mark all expect the `saved' region free. */
 938         blist_free(sdp->swd_blist, addr, size);
 939
 940         /*
 941          * if the vnode we are swapping to is the root vnode
 942          * (i.e. we are swapping to the miniroot) then we want
 943          * to make sure we don't overwrite it.   do a statfs to
 944          * find its size and skip over it.
 945          */
 946         if (vp == rootvp) {
 947                 struct mount *mp;
 948                 struct statvfs *sp;
 949                 int rootblocks, rootpages;
 950
 951                 mp = rootvnode->v_mount;
 952                 sp = &mp->mnt_stat;
 953                 rootblocks = sp->f_blocks * btodb(sp->f_frsize);
 954                 /*
 955                  * XXX: sp->f_blocks isn't the total number of
 956                  * blocks in the filesystem, it's the number of
 957                  * data blocks.  so, our rootblocks almost
 958                  * definitely underestimates the total size
 959                  * of the filesystem - how badly depends on the
 960                  * details of the filesystem type.  there isn't
 961                  * an obvious way to deal with this cleanly
 962                  * and perfectly, so for now we just pad our
 963                  * rootblocks estimate with an extra 5 percent.
 964                  */
 965                 rootblocks += (rootblocks >> 5) +
 966                         (rootblocks >> 6) +
 967                         (rootblocks >> 7);
 968                 rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
 969                 if (rootpages > size)
 970                         panic("swap_on: miniroot larger than swap?");
 971
 972                 if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
 973                         panic("swap_on: unable to preserve miniroot");
 974                 }
 975
 976                 size -= rootpages;
 977                 printf("Preserved %d pages of miniroot ", rootpages);
 978                 printf("leaving %d pages of swap\n", size);
 979         }
 980
 981         /*
 982          * add a ref to vp to reflect usage as a swap device.
 983          */
 984         vref(vp);
 985
 986         /*
 987          * now add the new swapdev to the drum and enable.
 988          */
 989         result = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP);
 990         if (result == 0)
 991                 panic("swapdrum_add");
 992         /*
 993          * If this is the first regular swap create the workqueue.
 994          * => Protected by swap_syscall_lock.
 995          */
 996         if (vp->v_type != VBLK) {
 997                 if (sw_reg_count++ == 0) {
 998                         KASSERT(sw_reg_workqueue == NULL);
 999                         if (workqueue_create(&sw_reg_workqueue, "swapiod",
1000                             sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
1001                                 panic("%s: workqueue_create failed", __func__);
1002                 }
1003         }
1004
1005         sdp->swd_drumoffset = (int)result;
1006         sdp->swd_drumsize = npages;
1007         sdp->swd_npages = size;
1008         mutex_enter(&uvm_swap_data_lock);
1009         sdp->swd_flags &= ~SWF_FAKE;    /* going live */
1010         sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
1011         uvmexp.swpages += size;
1012         uvmexp.swpgavail += size;
1013         mutex_exit(&uvm_swap_data_lock);
1014         return (0);
1015
1016         /*
1017          * failure: clean up and return error.
1018          */
1019
1020 bad:
1021         if (sdp->swd_blist) {
1022                 blist_destroy(sdp->swd_blist);
1023         }
1024         if (vp != rootvp) {
1025                 (void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
1026         }
1027         return (error);
1028 }
1029
1030 /*
1031  * swap_off: stop swapping on swapdev
1032  *
1033  * => swap data should be locked, we will unlock.
1034  */
1035 static int
1036 swap_off(struct lwp *l, struct swapdev *sdp)
1037 {
1038         int npages = sdp->swd_npages;
1039         int error = 0;
1040
1041         UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
1042         UVMHIST_LOG(pdhist, "  dev=%x, npages=%d", sdp->swd_dev,npages,0,0);
1043
1044         /* disable the swap area being removed */
1045         sdp->swd_flags &= ~SWF_ENABLE;
1046         uvmexp.swpgavail -= npages;
1047         mutex_exit(&uvm_swap_data_lock);
1048
1049         /*
1050          * the idea is to find all the pages that are paged out to this
1051          * device, and page them all in.  in uvm, swap-backed pageable
1052          * memory can take two forms: aobjs and anons.  call the
1053          * swapoff hook for each subsystem to bring in pages.
1054          */
1055
1056         if (uao_swap_off(sdp->swd_drumoffset,
1057                          sdp->swd_drumoffset + sdp->swd_drumsize) ||
1058             amap_swap_off(sdp->swd_drumoffset,
1059                           sdp->swd_drumoffset + sdp->swd_drumsize)) {
1060                 error = ENOMEM;
1061         } else if (sdp->swd_npginuse > sdp->swd_npgbad) {
1062                 error = EBUSY;
1063         }
1064
1065         if (error) {
1066                 mutex_enter(&uvm_swap_data_lock);
1067                 sdp->swd_flags |= SWF_ENABLE;
1068                 uvmexp.swpgavail += npages;
1069                 mutex_exit(&uvm_swap_data_lock);
1070
1071                 return error;
1072         }
1073
1074         /*
1075          * If this is the last regular swap destroy the workqueue.
1076          * => Protected by swap_syscall_lock.
1077          */
1078         if (sdp->swd_vp->v_type != VBLK) {
1079                 KASSERT(sw_reg_count > 0);
1080                 KASSERT(sw_reg_workqueue != NULL);
1081                 if (--sw_reg_count == 0) {
1082                         workqueue_destroy(sw_reg_workqueue);
1083                         sw_reg_workqueue = NULL;
1084                 }
1085         }
1086
1087         /*
1088          * done with the vnode.
1089          * drop our ref on the vnode before calling VOP_CLOSE()
1090          * so that spec_close() can tell if this is the last close.
1091          */
1092         vrele(sdp->swd_vp);
1093         if (sdp->swd_vp != rootvp) {
1094                 (void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
1095         }
1096
1097         mutex_enter(&uvm_swap_data_lock);
1098         uvmexp.swpages -= npages;
1099         uvmexp.swpginuse -= sdp->swd_npgbad;
1100
1101         if (swaplist_find(sdp->swd_vp, true) == NULL)
1102                 panic("%s: swapdev not in list", __func__);
1103         swaplist_trim();
1104         mutex_exit(&uvm_swap_data_lock);
1105
1106         /*
1107          * free all resources!
1108          */
1109         vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
1110         blist_destroy(sdp->swd_blist);
1111         bufq_free(sdp->swd_tab);
1112         free(sdp, M_VMSWAP);
1113         return (0);
1114 }
1115
1116 /*
1117  * /dev/drum interface and i/o functions
1118  */
1119
1120 /*
1121  * swstrategy: perform I/O on the drum
1122  *
1123  * => we must map the i/o request from the drum to the correct swapdev.
1124  */
1125 static void
1126 swstrategy(struct buf *bp)
1127 {
1128         struct swapdev *sdp;
1129         struct vnode *vp;
1130         int pageno, bn;
1131         UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
1132
1133         /*
1134          * convert block number to swapdev.   note that swapdev can't
1135          * be yanked out from under us because we are holding resources
1136          * in it (i.e. the blocks we are doing I/O on).
1137          */
1138         pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
1139         mutex_enter(&uvm_swap_data_lock);
1140         sdp = swapdrum_getsdp(pageno);
1141         mutex_exit(&uvm_swap_data_lock);
1142         if (sdp == NULL) {
1143                 bp->b_error = EINVAL;
1144                 biodone(bp);
1145                 UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
1146                 return;
1147         }
1148
1149         /*
1150          * convert drum page number to block number on this swapdev.
1151          */
1152
1153         pageno -= sdp->swd_drumoffset;  /* page # on swapdev */
1154         bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
1155
1156         UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
1157                 ((bp->b_flags & B_READ) == 0) ? "write" : "read",
1158                 sdp->swd_drumoffset, bn, bp->b_bcount);
1159
1160         /*
1161          * for block devices we finish up here.
1162          * for regular files we have to do more work which we delegate
1163          * to sw_reg_strategy().
1164          */
1165
1166         vp = sdp->swd_vp;               /* swapdev vnode pointer */
1167         switch (vp->v_type) {
1168         default:
1169                 panic("%s: vnode type 0x%x", __func__, vp->v_type);
1170
1171         case VBLK:
1172
1173                 /*
1174                  * must convert "bp" from an I/O on /dev/drum to an I/O
1175                  * on the swapdev (sdp).
1176                  */
1177                 bp->b_blkno = bn;               /* swapdev block number */
1178                 bp->b_dev = sdp->swd_dev;       /* swapdev dev_t */
1179
1180                 /*
1181                  * if we are doing a write, we have to redirect the i/o on
1182                  * drum's v_numoutput counter to the swapdevs.
1183                  */
1184                 if ((bp->b_flags & B_READ) == 0) {
1185                         mutex_enter(bp->b_objlock);
1186                         vwakeup(bp);    /* kills one 'v_numoutput' on drum */
1187                         mutex_exit(bp->b_objlock);
1188                         mutex_enter(&vp->v_interlock);
1189                         vp->v_numoutput++;      /* put it on swapdev */
1190                         mutex_exit(&vp->v_interlock);
1191                 }
1192
1193                 /*
1194                  * finally plug in swapdev vnode and start I/O
1195                  */
1196                 bp->b_vp = vp;
1197                 bp->b_objlock = &vp->v_interlock;
1198                 VOP_STRATEGY(vp, bp);
1199                 return;
1200
1201         case VREG:
1202                 /*
1203                  * delegate to sw_reg_strategy function.
1204                  */
1205                 sw_reg_strategy(sdp, bp, bn);
1206                 return;
1207         }
1208         /* NOTREACHED */
1209 }
1210
1211 /*
1212  * swread: the read function for the drum (just a call to physio)
1213  */
1214 /*ARGSUSED*/
1215 static int
1216 swread(dev_t dev, struct uio *uio, int ioflag)
1217 {
1218         UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
1219
1220         UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1221         return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
1222 }
1223
1224 /*
1225  * swwrite: the write function for the drum (just a call to physio)
1226  */
1227 /*ARGSUSED*/
1228 static int
1229 swwrite(dev_t dev, struct uio *uio, int ioflag)
1230 {
1231         UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
1232
1233         UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
1234         return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
1235 }
1236
1237 const struct bdevsw swap_bdevsw = {
1238         nullopen, nullclose, swstrategy, noioctl, nodump, nosize, D_OTHER,
1239 };
1240
1241 const struct cdevsw swap_cdevsw = {
1242         nullopen, nullclose, swread, swwrite, noioctl,
1243         nostop, notty, nopoll, nommap, nokqfilter, D_OTHER,
1244 };
1245
1246 /*
1247  * sw_reg_strategy: handle swap i/o to regular files
1248  */
1249 static void
1250 sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
1251 {
1252         struct vnode    *vp;
1253         struct vndxfer  *vnx;
1254         daddr_t         nbn;
1255         char            *addr;
1256         off_t           byteoff;
1257         int             s, off, nra, error, sz, resid;
1258         UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
1259
1260         /*
1261          * allocate a vndxfer head for this transfer and point it to
1262          * our buffer.
1263          */
1264         vnx = pool_get(&vndxfer_pool, PR_WAITOK);
1265         vnx->vx_flags = VX_BUSY;
1266         vnx->vx_error = 0;
1267         vnx->vx_pending = 0;
1268         vnx->vx_bp = bp;
1269         vnx->vx_sdp = sdp;
1270
1271         /*
1272          * setup for main loop where we read filesystem blocks into
1273          * our buffer.
1274          */
1275         error = 0;
1276         bp->b_resid = bp->b_bcount;     /* nothing transfered yet! */
1277         addr = bp->b_data;              /* current position in buffer */
1278         byteoff = dbtob((uint64_t)bn);
1279
1280         for (resid = bp->b_resid; resid; resid -= sz) {
1281                 struct vndbuf   *nbp;
1282
1283                 /*
1284                  * translate byteoffset into block number.  return values:
1285                  *   vp = vnode of underlying device
1286                  *  nbn = new block number (on underlying vnode dev)
1287                  *  nra = num blocks we can read-ahead (excludes requested
1288                  *      block)
1289                  */
1290                 nra = 0;
1291                 error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
1292                                         &vp, &nbn, &nra);
1293
1294                 if (error == 0 && nbn == (daddr_t)-1) {
1295                         /*
1296                          * this used to just set error, but that doesn't
1297                          * do the right thing.  Instead, it causes random
1298                          * memory errors.  The panic() should remain until
1299                          * this condition doesn't destabilize the system.
1300                          */
1301 #if 1
1302                         panic("%s: swap to sparse file", __func__);
1303 #else
1304                         error = EIO;    /* failure */
1305 #endif
1306                 }
1307
1308                 /*
1309                  * punt if there was an error or a hole in the file.
1310                  * we must wait for any i/o ops we have already started
1311                  * to finish before returning.
1312                  *
1313                  * XXX we could deal with holes here but it would be
1314                  * a hassle (in the write case).
1315                  */
1316                 if (error) {
1317                         s = splbio();
1318                         vnx->vx_error = error;  /* pass error up */
1319                         goto out;
1320                 }
1321
1322                 /*
1323                  * compute the size ("sz") of this transfer (in bytes).
1324                  */
1325                 off = byteoff % sdp->swd_bsize;
1326                 sz = (1 + nra) * sdp->swd_bsize - off;
1327                 if (sz > resid)
1328                         sz = resid;
1329
1330                 UVMHIST_LOG(pdhist, "sw_reg_strategy: "
1331                             "vp %p/%p offset 0x%x/0x%x",
1332                             sdp->swd_vp, vp, byteoff, nbn);
1333
1334                 /*
1335                  * now get a buf structure.   note that the vb_buf is
1336                  * at the front of the nbp structure so that you can
1337                  * cast pointers between the two structure easily.
1338                  */
1339                 nbp = pool_get(&vndbuf_pool, PR_WAITOK);
1340                 buf_init(&nbp->vb_buf);
1341                 nbp->vb_buf.b_flags    = bp->b_flags;
1342                 nbp->vb_buf.b_cflags   = bp->b_cflags;
1343                 nbp->vb_buf.b_oflags   = bp->b_oflags;
1344                 nbp->vb_buf.b_bcount   = sz;
1345                 nbp->vb_buf.b_bufsize  = sz;
1346                 nbp->vb_buf.b_error    = 0;
1347                 nbp->vb_buf.b_data     = addr;
1348                 nbp->vb_buf.b_lblkno   = 0;
1349                 nbp->vb_buf.b_blkno    = nbn + btodb(off);
1350                 nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
1351                 nbp->vb_buf.b_iodone   = sw_reg_biodone;
1352                 nbp->vb_buf.b_vp       = vp;
1353                 nbp->vb_buf.b_objlock  = &vp->v_interlock;
1354                 if (vp->v_type == VBLK) {
1355                         nbp->vb_buf.b_dev = vp->v_rdev;
1356                 }
1357
1358                 nbp->vb_xfer = vnx;     /* patch it back in to vnx */
1359
1360                 /*
1361                  * Just sort by block number
1362                  */
1363                 s = splbio();
1364                 if (vnx->vx_error != 0) {
1365                         buf_destroy(&nbp->vb_buf);
1366                         pool_put(&vndbuf_pool, nbp);
1367                         goto out;
1368                 }
1369                 vnx->vx_pending++;
1370
1371                 /* sort it in and start I/O if we are not over our limit */
1372                 /* XXXAD locking */
1373                 bufq_put(sdp->swd_tab, &nbp->vb_buf);
1374                 sw_reg_start(sdp);
1375                 splx(s);
1376
1377                 /*
1378                  * advance to the next I/O
1379                  */
1380                 byteoff += sz;
1381                 addr += sz;
1382         }
1383
1384         s = splbio();
1385
1386 out: /* Arrive here at splbio */
1387         vnx->vx_flags &= ~VX_BUSY;
1388         if (vnx->vx_pending == 0) {
1389                 error = vnx->vx_error;
1390                 pool_put(&vndxfer_pool, vnx);
1391                 bp->b_error = error;
1392                 biodone(bp);
1393         }
1394         splx(s);
1395 }
1396
1397 /*
1398  * sw_reg_start: start an I/O request on the requested swapdev
1399  *
1400  * => reqs are sorted by b_rawblkno (above)
1401  */
1402 static void
1403 sw_reg_start(struct swapdev *sdp)
1404 {
1405         struct buf      *bp;
1406         struct vnode    *vp;
1407         UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
1408
1409         /* recursion control */
1410         if ((sdp->swd_flags & SWF_BUSY) != 0)
1411                 return;
1412
1413         sdp->swd_flags |= SWF_BUSY;
1414
1415         while (sdp->swd_active < sdp->swd_maxactive) {
1416                 bp = bufq_get(sdp->swd_tab);
1417                 if (bp == NULL)
1418                         break;
1419                 sdp->swd_active++;
1420
1421                 UVMHIST_LOG(pdhist,
1422                     "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
1423                     bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
1424                 vp = bp->b_vp;
1425                 KASSERT(bp->b_objlock == &vp->v_interlock);
1426                 if ((bp->b_flags & B_READ) == 0) {
1427                         mutex_enter(&vp->v_interlock);
1428                         vp->v_numoutput++;
1429                         mutex_exit(&vp->v_interlock);
1430                 }
1431                 VOP_STRATEGY(vp, bp);
1432         }
1433         sdp->swd_flags &= ~SWF_BUSY;
1434 }
1435
1436 /*
1437  * sw_reg_biodone: one of our i/o's has completed
1438  */
1439 static void
1440 sw_reg_biodone(struct buf *bp)
1441 {
1442         workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
1443 }
1444
1445 /*
1446  * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
1447  *
1448  * => note that we can recover the vndbuf struct by casting the buf ptr
1449  */
1450 static void
1451 sw_reg_iodone(struct work *wk, void *dummy)
1452 {
1453         struct vndbuf *vbp = (void *)wk;
1454         struct vndxfer *vnx = vbp->vb_xfer;
1455         struct buf *pbp = vnx->vx_bp;           /* parent buffer */
1456         struct swapdev  *sdp = vnx->vx_sdp;
1457         int s, resid, error;
1458         KASSERT(&vbp->vb_buf.b_work == wk);
1459         UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
1460
1461         UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
1462             vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
1463         UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
1464             vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
1465
1466         /*
1467          * protect vbp at splbio and update.
1468          */
1469
1470         s = splbio();
1471         resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
1472         pbp->b_resid -= resid;
1473         vnx->vx_pending--;
1474
1475         if (vbp->vb_buf.b_error != 0) {
1476                 /* pass error upward */
1477                 error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
1478                 UVMHIST_LOG(pdhist, "  got error=%d !", error, 0, 0, 0);
1479                 vnx->vx_error = error;
1480         }
1481
1482         /*
1483          * kill vbp structure
1484          */
1485         buf_destroy(&vbp->vb_buf);
1486         pool_put(&vndbuf_pool, vbp);
1487
1488         /*
1489          * wrap up this transaction if it has run to completion or, in
1490          * case of an error, when all auxiliary buffers have returned.
1491          */
1492         if (vnx->vx_error != 0) {
1493                 /* pass error upward */
1494                 error = vnx->vx_error;
1495                 if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
1496                         pbp->b_error = error;
1497                         biodone(pbp);
1498                         pool_put(&vndxfer_pool, vnx);
1499                 }
1500         } else if (pbp->b_resid == 0) {
1501                 KASSERT(vnx->vx_pending == 0);
1502                 if ((vnx->vx_flags & VX_BUSY) == 0) {
1503                         UVMHIST_LOG(pdhist, "  iodone error=%d !",
1504                             pbp, vnx->vx_error, 0, 0);
1505                         biodone(pbp);
1506                         pool_put(&vndxfer_pool, vnx);
1507                 }
1508         }
1509
1510         /*
1511          * done!   start next swapdev I/O if one is pending
1512          */
1513         sdp->swd_active--;
1514         sw_reg_start(sdp);
1515         splx(s);
1516 }
1517
1518
1519 /*
1520  * uvm_swap_alloc: allocate space on swap
1521  *
1522  * => allocation is done "round robin" down the priority list, as we
1523  *      allocate in a priority we "rotate" the circle queue.
1524  * => space can be freed with uvm_swap_free
1525  * => we return the page slot number in /dev/drum (0 == invalid slot)
1526  * => we lock uvm_swap_data_lock
1527  * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
1528  */
1529 int
1530 uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
1531 {
1532         struct swapdev *sdp;
1533         struct swappri *spp;
1534         UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
1535
1536         /*
1537          * no swap devices configured yet?   definite failure.
1538          */
1539         if (uvmexp.nswapdev < 1)
1540                 return 0;
1541
1542         /*
1543          * lock data lock, convert slots into blocks, and enter loop
1544          */
1545         mutex_enter(&uvm_swap_data_lock);
1546
1547 ReTry:  /* XXXMRG */
1548         LIST_FOREACH(spp, &swap_priority, spi_swappri) {
1549                 CIRCLEQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
1550                         uint64_t result;
1551
1552                         /* if it's not enabled, then we can't swap from it */
1553                         if ((sdp->swd_flags & SWF_ENABLE) == 0)
1554                                 continue;
1555                         if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
1556                                 continue;
1557                         result = blist_alloc(sdp->swd_blist, *nslots);
1558                         if (result == BLIST_NONE) {
1559                                 continue;
1560                         }
1561                         KASSERT(result < sdp->swd_drumsize);
1562
1563                         /*
1564                          * successful allocation!  now rotate the circleq.
1565                          */
1566                         CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
1567                         CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
1568                         sdp->swd_npginuse += *nslots;
1569                         uvmexp.swpginuse += *nslots;
1570                         mutex_exit(&uvm_swap_data_lock);
1571                         /* done!  return drum slot number */
1572                         UVMHIST_LOG(pdhist,
1573                             "success!  returning %d slots starting at %d",
1574                             *nslots, result + sdp->swd_drumoffset, 0, 0);
1575                         return (result + sdp->swd_drumoffset);
1576                 }
1577         }
1578
1579         /* XXXMRG: BEGIN HACK */
1580         if (*nslots > 1 && lessok) {
1581                 *nslots = 1;
1582                 /* XXXMRG: ugh!  blist should support this for us */
1583                 goto ReTry;
1584         }
1585         /* XXXMRG: END HACK */
1586
1587         mutex_exit(&uvm_swap_data_lock);
1588         return 0;
1589 }
1590
1591 /*
1592  * uvm_swapisfull: return true if most of available swap is allocated
1593  * and in use.  we don't count some small portion as it may be inaccessible
1594  * to us at any given moment, for example if there is lock contention or if
1595  * pages are busy.
1596  */
1597 bool
1598 uvm_swapisfull(void)
1599 {
1600         int swpgonly;
1601         bool rv;
1602
1603         mutex_enter(&uvm_swap_data_lock);
1604         KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
1605         swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
1606             uvm_swapisfull_factor);
1607         rv = (swpgonly >= uvmexp.swpgavail);
1608         mutex_exit(&uvm_swap_data_lock);
1609
1610         return (rv);
1611 }
1612
1613 /*
1614  * uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
1615  *
1616  * => we lock uvm_swap_data_lock
1617  */
1618 void
1619 uvm_swap_markbad(int startslot, int nslots)
1620 {
1621         struct swapdev *sdp;
1622         UVMHIST_FUNC("uvm_swap_markbad"); UVMHIST_CALLED(pdhist);
1623
1624         mutex_enter(&uvm_swap_data_lock);
1625         sdp = swapdrum_getsdp(startslot);
1626         KASSERT(sdp != NULL);
1627
1628         /*
1629          * we just keep track of how many pages have been marked bad
1630          * in this device, to make everything add up in swap_off().
1631          * we assume here that the range of slots will all be within
1632          * one swap device.
1633          */
1634
1635         KASSERT(uvmexp.swpgonly >= nslots);
1636         uvmexp.swpgonly -= nslots;
1637         sdp->swd_npgbad += nslots;
1638         UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
1639         mutex_exit(&uvm_swap_data_lock);
1640 }
1641
1642 /*
1643  * uvm_swap_free: free swap slots
1644  *
1645  * => this can be all or part of an allocation made by uvm_swap_alloc
1646  * => we lock uvm_swap_data_lock
1647  */
1648 void
1649 uvm_swap_free(int startslot, int nslots)
1650 {
1651         struct swapdev *sdp;
1652         UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
1653
1654         UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
1655             startslot, 0, 0);
1656
1657         /*
1658          * ignore attempts to free the "bad" slot.
1659          */
1660
1661         if (startslot == SWSLOT_BAD) {
1662                 return;
1663         }
1664
1665         /*
1666          * convert drum slot offset back to sdp, free the blocks
1667          * in the extent, and return.   must hold pri lock to do
1668          * lookup and access the extent.
1669          */
1670
1671         mutex_enter(&uvm_swap_data_lock);
1672         sdp = swapdrum_getsdp(startslot);
1673         KASSERT(uvmexp.nswapdev >= 1);
1674         KASSERT(sdp != NULL);
1675         KASSERT(sdp->swd_npginuse >= nslots);
1676         blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
1677         sdp->swd_npginuse -= nslots;
1678         uvmexp.swpginuse -= nslots;
1679         mutex_exit(&uvm_swap_data_lock);
1680 }
1681
1682 /*
1683  * uvm_swap_put: put any number of pages into a contig place on swap
1684  *
1685  * => can be sync or async
1686  */
1687
1688 int
1689 uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
1690 {
1691         int error;
1692
1693         error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
1694             ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1695         return error;
1696 }
1697
1698 /*
1699  * uvm_swap_get: get a single page from swap
1700  *
1701  * => usually a sync op (from fault)
1702  */
1703
1704 int
1705 uvm_swap_get(struct vm_page *page, int swslot, int flags)
1706 {
1707         int error;
1708
1709         uvmexp.nswget++;
1710         KASSERT(flags & PGO_SYNCIO);
1711         if (swslot == SWSLOT_BAD) {
1712                 return EIO;
1713         }
1714
1715         error = uvm_swap_io(&page, swslot, 1, B_READ |
1716             ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
1717         if (error == 0) {
1718
1719                 /*
1720                  * this page is no longer only in swap.
1721                  */
1722
1723                 mutex_enter(&uvm_swap_data_lock);
1724                 KASSERT(uvmexp.swpgonly > 0);
1725                 uvmexp.swpgonly--;
1726                 mutex_exit(&uvm_swap_data_lock);
1727         }
1728         return error;
1729 }
1730
1731 /*
1732  * uvm_swap_io: do an i/o operation to swap
1733  */
1734
1735 static int
1736 uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
1737 {
1738         daddr_t startblk;
1739         struct  buf *bp;
1740         vaddr_t kva;
1741         int     error, mapinflags;
1742         bool write, async;
1743         UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
1744
1745         UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
1746             startslot, npages, flags, 0);
1747
1748         write = (flags & B_READ) == 0;
1749         async = (flags & B_ASYNC) != 0;
1750
1751         /*
1752          * allocate a buf for the i/o.
1753          */
1754
1755         KASSERT(curlwp != uvm.pagedaemon_lwp || (write && async));
1756         bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
1757         if (bp == NULL) {
1758                 uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
1759                 return ENOMEM;
1760         }
1761
1762         /*
1763          * convert starting drum slot to block number
1764          */
1765
1766         startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
1767
1768         /*
1769          * first, map the pages into the kernel.
1770          */
1771
1772         mapinflags = !write ?
1773                 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
1774                 UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
1775         kva = uvm_pagermapin(pps, npages, mapinflags);
1776
1777         /*
1778          * fill in the bp/sbp.   we currently route our i/o through
1779          * /dev/drum's vnode [swapdev_vp].
1780          */
1781
1782         bp->b_cflags = BC_BUSY | BC_NOCACHE;
1783         bp->b_flags = (flags & (B_READ|B_ASYNC));
1784         bp->b_proc = &proc0;    /* XXX */
1785         bp->b_vnbufs.le_next = NOLIST;
1786         bp->b_data = (void *)kva;
1787         bp->b_blkno = startblk;
1788         bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
1789
1790         /*
1791          * bump v_numoutput (counter of number of active outputs).
1792          */
1793
1794         if (write) {
1795                 mutex_enter(&swapdev_vp->v_interlock);
1796                 swapdev_vp->v_numoutput++;
1797                 mutex_exit(&swapdev_vp->v_interlock);
1798         }
1799
1800         /*
1801          * for async ops we must set up the iodone handler.
1802          */
1803
1804         if (async) {
1805                 bp->b_iodone = uvm_aio_biodone;
1806                 UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
1807                 if (curlwp == uvm.pagedaemon_lwp)
1808                         BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1809                 else
1810                         BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
1811         } else {
1812                 bp->b_iodone = NULL;
1813                 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
1814         }
1815         UVMHIST_LOG(pdhist,
1816             "about to start io: data = %p blkno = 0x%x, bcount = %ld",
1817             bp->b_data, bp->b_blkno, bp->b_bcount, 0);
1818
1819         /*
1820          * now we start the I/O, and if async, return.
1821          */
1822
1823         VOP_STRATEGY(swapdev_vp, bp);
1824         if (async)
1825                 return 0;
1826
1827         /*
1828          * must be sync i/o.   wait for it to finish
1829          */
1830
1831         error = biowait(bp);
1832
1833         /*
1834          * kill the pager mapping
1835          */
1836
1837         uvm_pagermapout(kva, npages);
1838
1839         /*
1840          * now dispose of the buf and we're done.
1841          */
1842
1843         if (write) {
1844                 mutex_enter(&swapdev_vp->v_interlock);
1845                 vwakeup(bp);
1846                 mutex_exit(&swapdev_vp->v_interlock);
1847         }
1848         putiobuf(bp);
1849         UVMHIST_LOG(pdhist, "<- done (sync)  error=%d", error, 0, 0, 0);
1850
1851         return (error);
1852 }