/*	$NetBSD: vfs_bio.c,v 1.221 2009/11/11 09:15:42 rmind Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran, and by Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * The buffer cache subsystem.
 *
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 *
 * Locking
 *
 * There are three locks:
 * - bufcache_lock: protects global buffer cache state.
 * - BC_BUSY: a long term per-buffer lock.
 * - buf_t::b_objlock: lock on completion (biowait vs biodone).
 *
 * For buffers associated with vnodes (the most common case) b_objlock points
 * to the vnode_t::v_interlock.  Otherwise, it points to generic buffer_lock.
 *
 * Lock order:
 *	bufcache_lock ->
 *		buf_t::b_objlock
 */
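
/*
 * Illustrative annotation (not part of the original source): code that
 * needs both locks acquires them in the order given above, e.g.:
 *
 *	mutex_enter(&bufcache_lock);
 *	mutex_enter(bp->b_objlock);
 *	...
 *	mutex_exit(bp->b_objlock);
 *	mutex_exit(&bufcache_lock);
 *
 * The release order may differ (cf. bwrite(), which drops bufcache_lock
 * before b_objlock), but bufcache_lock is never acquired while b_objlock
 * is already held.
 */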

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.221 2009/11/11 09:15:42 rmind Exp $");

#include "fs_ffs.h"
#include "opt_bufcache.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/wapbl.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

#ifndef	BUFPAGES
# define BUFPAGES 0
#endif

#ifdef BUFCACHE
# if (BUFCACHE < 5) || (BUFCACHE > 95)
#  error BUFCACHE is not between 5 and 95
# endif
#else
# define BUFCACHE 15
#endif

u_int	nbuf;			/* desired number of buffer headers */
u_int	bufpages = BUFPAGES;	/* optional hardwired count */
u_int	bufcache = BUFCACHE;	/* max % of RAM to use for buffer cache */

/* Function prototypes */
struct bqueue;

static void buf_setwm(void);
static int buf_trim(void);
static void *bufpool_page_alloc(struct pool *, int);
static void bufpool_page_free(struct pool *, void *);
static buf_t *bio_doread(struct vnode *, daddr_t, int,
    kauth_cred_t, int);
static buf_t *getnewbuf(int, int, int);
static int buf_lotsfree(void);
static int buf_canrelease(void);
static u_long buf_mempoolidx(u_long);
static u_long buf_roundsize(u_long);
static void *buf_malloc(size_t);
static void buf_mrelease(void *, size_t);
static void binsheadfree(buf_t *, struct bqueue *);
static void binstailfree(buf_t *, struct bqueue *);
int count_lock_queue(void); /* XXX */
#ifdef DEBUG
static int checkfreelist(buf_t *, struct bqueue *, int);
#endif
static void biointr(void *);
static void biodone2(buf_t *);
static void bref(buf_t *);
static void brele(buf_t *);
static void sysctl_kern_buf_setup(void);
static void sysctl_vm_buf_setup(void);

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;
struct bqueue bufqueues[BQUEUES];

static kcondvar_t needbuffer_cv;

/*
 * Buffer queue lock.
 */
kmutex_t bufcache_lock;
kmutex_t buffer_lock;

/* Software ISR for completed transfers. */
static void *biodone_sih;

/* Buffer pool for I/O buffers. */
static pool_cache_t buf_cache;
static pool_cache_t bufio_cache;

/* XXX - somewhat gross.. */
#if MAXBSIZE == 0x2000
#define NMEMPOOLS 5
#elif MAXBSIZE == 0x4000
#define NMEMPOOLS 6
#elif MAXBSIZE == 0x8000
#define NMEMPOOLS 7
#else
#define NMEMPOOLS 8
#endif

#define MEMPOOL_INDEX_OFFSET 9	/* smallest pool is 512 bytes */
#if (1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) != MAXBSIZE
#error update vfs_bio buffer memory parameters
#endif
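
/*
 * Worked example (annotation): pool i holds buffers of
 * 1 << (i + MEMPOOL_INDEX_OFFSET) bytes, so the smallest pool holds
 * 512-byte buffers and the largest must match MAXBSIZE; with
 * NMEMPOOLS == 8 the check above requires
 * 1 << (8 + 9 - 1) == 0x10000 == MAXBSIZE.
 */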

/* Buffer memory pools */
static struct pool bmempools[NMEMPOOLS];

static struct vm_map *buf_map;

/*
 * Buffer memory pool allocator.
 */
static void *
bufpool_page_alloc(struct pool *pp, int flags)
{

	return (void *)uvm_km_alloc(buf_map,
	    MAXBSIZE, MAXBSIZE,
	    ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
	    | UVM_KMF_WIRED);
}

static void
bufpool_page_free(struct pool *pp, void *v)
{

	uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
}

static struct pool_allocator bufmempool_allocator = {
	.pa_alloc = bufpool_page_alloc,
	.pa_free = bufpool_page_free,
	.pa_pagesz = MAXBSIZE,
};

/* Buffer memory management variables */
u_long bufmem_valimit;
u_long bufmem_hiwater;
u_long bufmem_lowater;
u_long bufmem;

/*
 * MD code can call this to set a hard limit on the amount
 * of virtual memory used by the buffer cache.
 */
int
buf_setvalimit(vsize_t sz)
{

	/* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
	if (sz < NMEMPOOLS * MAXBSIZE)
		return EINVAL;

	bufmem_valimit = sz;
	return 0;
}

static void
buf_setwm(void)
{

	bufmem_hiwater = buf_memcalc();
	/* lowater is approx. 2% of memory (with bufcache = 15) */
#define	BUFMEM_WMSHIFT	3
#define	BUFMEM_HIWMMIN	(64 * 1024 << BUFMEM_WMSHIFT)
	if (bufmem_hiwater < BUFMEM_HIWMMIN)
		/* Ensure a reasonable minimum value */
		bufmem_hiwater = BUFMEM_HIWMMIN;
	bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
}
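
/*
 * Worked example (annotation): with BUFMEM_WMSHIFT == 3,
 * bufmem_lowater is bufmem_hiwater / 8; e.g. a 256 MB high watermark
 * yields a 32 MB low watermark.  BUFMEM_HIWMMIN is 64 KB << 3 == 512 KB,
 * so the high watermark never drops below 512 KB and the low watermark,
 * consequently, never below 64 KB.
 */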

#ifdef DEBUG
int debug_verify_freelist = 0;
static int
checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
{
	buf_t *b;

	if (!debug_verify_freelist)
		return 1;

	TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
		if (b == bp)
			return ison ? 1 : 0;
	}

	return ison ? 0 : 1;
}
#endif

/*
 * Insq/Remq for the buffer free lists.
 * Call with buffer queue locked.
 */
static void
binsheadfree(buf_t *bp, struct bqueue *dp)
{

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_freelistindex == -1);
	TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
	dp->bq_bytes += bp->b_bufsize;
	bp->b_freelistindex = dp - bufqueues;
}

static void
binstailfree(buf_t *bp, struct bqueue *dp)
{

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_freelistindex == -1);
	TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
	dp->bq_bytes += bp->b_bufsize;
	bp->b_freelistindex = dp - bufqueues;
}

void
bremfree(buf_t *bp)
{
	struct bqueue *dp;
	int bqidx = bp->b_freelistindex;

	KASSERT(mutex_owned(&bufcache_lock));

	KASSERT(bqidx != -1);
	dp = &bufqueues[bqidx];
	KDASSERT(checkfreelist(bp, dp, 1));
	KASSERT(dp->bq_bytes >= bp->b_bufsize);
	TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
	dp->bq_bytes -= bp->b_bufsize;

	/* For the sysctl helper. */
	if (bp == dp->bq_marker)
		dp->bq_marker = NULL;

#if defined(DIAGNOSTIC)
	bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
}

/*
 * Add a reference to a buffer structure that came from buf_cache.
 */
static inline void
bref(buf_t *bp)
{

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_refcnt > 0);

	bp->b_refcnt++;
}

/*
 * Free an unused buffer structure that came from buf_cache.
 */
static inline void
brele(buf_t *bp)
{

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_refcnt > 0);

	if (bp->b_refcnt-- == 1) {
		buf_destroy(bp);
#ifdef DEBUG
		memset((char *)bp, 0, sizeof(*bp));
#endif
		pool_cache_put(buf_cache, bp);
	}
}

/*
 * Note that for some ports this is used by pmap bootstrap code to
 * determine kva size.
 */
u_long
buf_memcalc(void)
{
	u_long n;

	/*
	 * Determine the upper bound of memory to use for buffers.
	 *
	 *	- If bufpages is specified, use that as the number
	 *	  of pages.
	 *
	 *	- Otherwise, use bufcache as the percentage of
	 *	  physical memory.
	 */
	if (bufpages != 0) {
		n = bufpages;
	} else {
		if (bufcache < 5) {
			printf("forcing bufcache %d -> 5", bufcache);
			bufcache = 5;
		}
		if (bufcache > 95) {
			printf("forcing bufcache %d -> 95", bufcache);
			bufcache = 95;
		}
		n = calc_cache_size(buf_map, bufcache,
		    (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
		    / PAGE_SIZE;
	}

	n <<= PAGE_SHIFT;
	if (bufmem_valimit != 0 && n > bufmem_valimit)
		n = bufmem_valimit;

	return (n);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit(void)
{
	struct bqueue *dp;
	int use_std;
	u_int i;

	mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&needbuffer_cv, "needbuf");

	if (bufmem_valimit != 0) {
		vaddr_t minaddr = 0, maxaddr;
		buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
		    bufmem_valimit, 0, false, 0);
		if (buf_map == NULL)
			panic("bufinit: cannot allocate submap");
	} else
		buf_map = kernel_map;

	/*
	 * Initialize buffer cache memory parameters.
	 */
	bufmem = 0;
	buf_setwm();

	/* On "small" machines use small pool page sizes where possible */
	use_std = (physmem < atop(16*1024*1024));

	/*
	 * Also use them on systems that can map the pool pages using
	 * a direct-mapped segment.
	 */
#ifdef PMAP_MAP_POOLPAGE
	use_std = 1;
#endif

	buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
	    "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
	bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
	    "biopl", NULL, IPL_BIO, NULL, NULL, NULL);

	bufmempool_allocator.pa_backingmap = buf_map;
	for (i = 0; i < NMEMPOOLS; i++) {
		struct pool_allocator *pa;
		struct pool *pp = &bmempools[i];
		u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
		char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
		if (__predict_true(size >= 1024))
			(void)snprintf(name, 8, "buf%dk", size / 1024);
		else
			(void)snprintf(name, 8, "buf%db", size);
		pa = (size <= PAGE_SIZE && use_std)
			? &pool_allocator_nointr
			: &bufmempool_allocator;
		pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE);
		pool_setlowat(pp, 1);
		pool_sethiwat(pp, 1);
	}

	/* Initialize the buffer queues */
	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
		TAILQ_INIT(&dp->bq_queue);
		dp->bq_bytes = 0;
	}

	/*
	 * Estimate hash table size based on the amount of memory we
	 * intend to use for the buffer cache. The average buffer
	 * size is dependent on our clients (i.e. filesystems).
	 *
	 * For now, use an empirical 3K per buffer.
	 */
	nbuf = (bufmem_hiwater / 1024) / 3;
	bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);

	sysctl_kern_buf_setup();
	sysctl_vm_buf_setup();
}

void
bufinit2(void)
{

	biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
	    NULL);
	if (biodone_sih == NULL)
		panic("bufinit2: can't establish soft interrupt");
}

static int
buf_lotsfree(void)
{
	int try, thresh;

	/* Always allocate if less than the low water mark. */
	if (bufmem < bufmem_lowater)
		return 1;

	/* Never allocate if greater than the high water mark. */
	if (bufmem > bufmem_hiwater)
		return 0;

	/* If there's anything on the AGE list, it should be eaten. */
	if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
		return 0;

	/*
	 * The probability of getting a new allocation is inversely
	 * proportional to the current size of the cache, using
	 * a granularity of 16 steps.
	 */
	try = random() & 0x0000000fL;

	/* Don't use "16 * bufmem" here to avoid a 32-bit overflow. */
	thresh = (bufmem - bufmem_lowater) /
	    ((bufmem_hiwater - bufmem_lowater) / 16);

	if (try >= thresh)
		return 1;

	/* Otherwise don't allocate. */
	return 0;
}
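
/*
 * Worked example (annotation): thresh scales linearly from 0 at the
 * low watermark to 16 at the high watermark.  Halfway in between,
 * thresh is 8, so a random try in [0, 15] grants the allocation with
 * probability 8/16; near the high watermark thresh approaches 16 and
 * the test almost never grants one.
 */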

/*
 * Return estimate of bytes we think need to be
 * released to help resolve low memory conditions.
 *
 * => called with bufcache_lock held.
 */
static int
buf_canrelease(void)
{
	int pagedemand, ninvalid = 0;

	KASSERT(mutex_owned(&bufcache_lock));

	if (bufmem < bufmem_lowater)
		return 0;

	if (bufmem > bufmem_hiwater)
		return bufmem - bufmem_hiwater;

	ninvalid += bufqueues[BQ_AGE].bq_bytes;

	pagedemand = uvmexp.freetarg - uvmexp.free;
	if (pagedemand < 0)
		return ninvalid;
	return MAX(ninvalid, MIN(2 * MAXBSIZE,
	    MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
}

/*
 * Buffer memory allocation helper functions
 */
static u_long
buf_mempoolidx(u_long size)
{
	u_int n = 0;

	size -= 1;
	size >>= MEMPOOL_INDEX_OFFSET;
	while (size) {
		size >>= 1;
		n += 1;
	}
	if (n >= NMEMPOOLS)
		panic("buf mem pool index %d", n);
	return n;
}

static u_long
buf_roundsize(u_long size)
{

	/* Round up to nearest power of 2 */
	return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
}
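
/*
 * Example (annotation): for a 3000-byte request, (3000 - 1) >> 9 == 5,
 * which needs three right shifts to reach zero, so buf_mempoolidx()
 * returns 3 and buf_roundsize() rounds the request up to
 * 1 << (3 + 9) == 4096 bytes.
 */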

static void *
buf_malloc(size_t size)
{
	u_int n = buf_mempoolidx(size);
	void *addr;

	while (1) {
		addr = pool_get(&bmempools[n], PR_NOWAIT);
		if (addr != NULL)
			break;

		/* No memory, see if we can free some. If so, try again */
		mutex_enter(&bufcache_lock);
		if (buf_drain(1) > 0) {
			mutex_exit(&bufcache_lock);
			continue;
		}

		if (curlwp == uvm.pagedaemon_lwp) {
			mutex_exit(&bufcache_lock);
			return NULL;
		}

		/* Wait for buffers to arrive on the LRU queue */
		cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
		mutex_exit(&bufcache_lock);
	}

	return addr;
}

static void
buf_mrelease(void *addr, size_t size)
{

	pool_put(&bmempools[buf_mempoolidx(size)], addr);
}

/*
 * bread()/breadn() helper.
 */
static buf_t *
bio_doread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
    int async)
{
	buf_t *bp;
	struct mount *mp;

	bp = getblk(vp, blkno, size, 0, 0);

#ifdef DIAGNOSTIC
	if (bp == NULL) {
		panic("bio_doread: no such buf");
	}
#endif

	/*
	 * If buffer does not have data valid, start a read.
	 * Note that if buffer is BC_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
		/* Start I/O for the buffer. */
		SET(bp->b_flags, B_READ | async);
		if (async)
			BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
		else
			BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
		VOP_STRATEGY(vp, bp);

		/* Pay for the read. */
		curlwp->l_ru.ru_inblock++;
	} else if (async)
		brelse(bp, 0);

	if (vp->v_type == VBLK)
		mp = vp->v_specmountpoint;
	else
		mp = vp->v_mount;

	/*
	 * Collect statistics on synchronous and asynchronous reads.
	 * Reads from block devices are charged to their associated
	 * filesystem (if any).
	 */
	if (mp != NULL) {
		if (async == 0)
			mp->mnt_stat.f_syncreads++;
		else
			mp->mnt_stat.f_asyncreads++;
	}

	return (bp);
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
    int flags, buf_t **bpp)
{
	buf_t *bp;
	int error;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

	/* Wait for the read to complete, and return result. */
	error = biowait(bp);
	if (error == 0 && (flags & B_MODIFY) != 0)
		error = fscow_run(bp, true);

	return error;
}
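
/*
 * Typical use (sketch, not from the original source; "bsize" stands in
 * for the file system's block size):
 *
 *	buf_t *bp;
 *	int error = bread(vp, lbn, bsize, cred, 0, &bp);
 *	if (error == 0) {
 *		... inspect bp->b_data ...
 *		brelse(bp, 0);
 *	}
 */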

/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
    int *rasizes, int nrablks, kauth_cred_t cred, int flags, buf_t **bpp)
{
	buf_t *bp;
	int error, i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	mutex_enter(&bufcache_lock);
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		mutex_exit(&bufcache_lock);
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	error = biowait(bp);
	if (error == 0 && (flags & B_MODIFY) != 0)
		error = fscow_run(bp, true);
	return error;
}

/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(buf_t *bp)
{
	int rv, sync, wasdelayed;
	struct vnode *vp;
	struct mount *mp;

	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
	KASSERT(!cv_has_waiters(&bp->b_done));

	vp = bp->b_vp;
	if (vp != NULL) {
		KASSERT(bp->b_objlock == &vp->v_interlock);
		if (vp->v_type == VBLK)
			mp = vp->v_specmountpoint;
		else
			mp = vp->v_mount;
	} else {
		mp = NULL;
	}

	if (mp && mp->mnt_wapbl) {
		if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
			bdwrite(bp);
			return 0;
		}
	}

	/*
	 * Remember buffer type, to switch on it later.  If the write was
	 * synchronous, but the file system was mounted with MNT_ASYNC,
	 * convert it to a delayed write.
	 * XXX note that this relies on delayed tape writes being converted
	 * to async, not sync writes (which is safe, but ugly).
	 */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
		bdwrite(bp);
		return (0);
	}

	/*
	 * Collect statistics on synchronous and asynchronous writes.
	 * Writes to block devices are charged to their associated
	 * filesystem (if any).
	 */
	if (mp != NULL) {
		if (sync)
			mp->mnt_stat.f_syncwrites++;
		else
			mp->mnt_stat.f_asyncwrites++;
	}

	/*
	 * Pay for the I/O operation and make sure the buf is on the correct
	 * vnode queue.
	 */
	bp->b_error = 0;
	wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
	CLR(bp->b_flags, B_READ);
	if (wasdelayed) {
		mutex_enter(&bufcache_lock);
		mutex_enter(bp->b_objlock);
		CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
		reassignbuf(bp, bp->b_vp);
		mutex_exit(&bufcache_lock);
	} else {
		curlwp->l_ru.ru_oublock++;
		mutex_enter(bp->b_objlock);
		CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
	}
	if (vp != NULL)
		vp->v_numoutput++;
	mutex_exit(bp->b_objlock);

	/* Initiate disk write. */
	if (sync)
		BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
	else
		BIO_SETPRIO(bp, BPRIO_TIMELIMITED);

	VOP_STRATEGY(vp, bp);

	if (sync) {
		/* If I/O was synchronous, wait for it to complete. */
		rv = biowait(bp);

		/* Release the buffer. */
		brelse(bp, 0);

		return (rv);
	} else {
		return (0);
	}
}

int
vn_bwrite(void *v)
{
	struct vop_bwrite_args *ap = v;

	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(buf_t *bp)
{

	KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
	    bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
	KASSERT(!cv_has_waiters(&bp->b_done));

	/* If this is a tape block, write the block now. */
	if (bdev_type(bp->b_dev) == D_TAPE) {
		bawrite(bp);
		return;
	}

	if (wapbl_vphaswapbl(bp->b_vp)) {
		struct mount *mp = wapbl_vptomp(bp->b_vp);

		if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
			WAPBL_ADD_BUF(mp, bp);
		}
	}

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write,
	 *	(3) Make sure it's on its vnode's correct block list.
	 */
	KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock);

	if (!ISSET(bp->b_oflags, BO_DELWRI)) {
		mutex_enter(&bufcache_lock);
		mutex_enter(bp->b_objlock);
		SET(bp->b_oflags, BO_DELWRI);
		curlwp->l_ru.ru_oublock++;
		reassignbuf(bp, bp->b_vp);
		mutex_exit(&bufcache_lock);
	} else {
		mutex_enter(bp->b_objlock);
	}
	/* Otherwise, the "write" is done, so mark and release the buffer. */
	CLR(bp->b_oflags, BO_DONE);
	mutex_exit(bp->b_objlock);

	brelse(bp, 0);
}

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(buf_t *bp)
{

	KASSERT(ISSET(bp->b_cflags, BC_BUSY));

	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelsel(buf_t *bp, int set)
{
	struct bqueue *bufq;
	struct vnode *vp;

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(!cv_has_waiters(&bp->b_done));
	KASSERT(bp->b_refcnt > 0);

	SET(bp->b_cflags, set);

	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
	KASSERT(bp->b_iodone == NULL);

	/* Wake up any processes waiting for any buffer to become free. */
	cv_signal(&needbuffer_cv);

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_cflags, BC_WANTED))
		CLR(bp->b_cflags, BC_WANTED|BC_AGE);

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, B_LOCKED))
		bp->b_error = 0;

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
		SET(bp->b_cflags, BC_INVAL);

	if (ISSET(bp->b_cflags, BC_VFLUSH)) {
		/*
		 * This is a delayed write buffer that was just flushed to
		 * disk.  It is still on the LRU queue.  If it's become
		 * invalid, then we need to move it to a different queue;
		 * otherwise leave it in its current position.
		 */
		CLR(bp->b_cflags, BC_VFLUSH);
		if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
		    !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
			KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
			goto already_queued;
		} else {
			bremfree(bp);
		}
	}

	KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
	KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put on the head of the appropriate queue.
		 */
		if (ISSET(bp->b_flags, B_LOCKED)) {
			if (wapbl_vphaswapbl(vp = bp->b_vp)) {
				struct mount *mp = wapbl_vptomp(vp);

				KASSERT(bp->b_iodone
				    != mp->mnt_wapbl_op->wo_wapbl_biodone);
				WAPBL_REMOVE_BUF(mp, bp);
			}
		}

		mutex_enter(bp->b_objlock);
		CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
		if ((vp = bp->b_vp) != NULL) {
			KASSERT(bp->b_objlock == &vp->v_interlock);
			reassignbuf(bp, bp->b_vp);
			brelvp(bp);
			mutex_exit(&vp->v_interlock);
		} else {
			KASSERT(bp->b_objlock == &buffer_lock);
			mutex_exit(bp->b_objlock);
		}

		if (bp->b_bufsize <= 0)
			/* no data */
			goto already_queued;
		else
			/* invalid data */
			bufq = &bufqueues[BQ_AGE];
		binsheadfree(bp, bufq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 * If buf is AGE, but has dependencies, must put it on last
		 * bufqueue to be scanned, ie LRU. This protects against the
		 * livelock where BQ_AGE only has buffers with dependencies,
		 * and we thus never get to the dependent buffers in BQ_LRU.
		 */
		if (ISSET(bp->b_flags, B_LOCKED)) {
			/* locked in core */
			bufq = &bufqueues[BQ_LOCKED];
		} else if (!ISSET(bp->b_cflags, BC_AGE)) {
			/* valid data */
			bufq = &bufqueues[BQ_LRU];
		} else {
			/* stale but valid data */
			bufq = &bufqueues[BQ_AGE];
		}
		binstailfree(bp, bufq);
	}
already_queued:
	/* Unlock the buffer. */
	CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
	CLR(bp->b_flags, B_ASYNC);
	cv_broadcast(&bp->b_busy);

	if (bp->b_bufsize <= 0)
		brele(bp);
}

void
brelse(buf_t *bp, int set)
{

	mutex_enter(&bufcache_lock);
	brelsel(bp, set);
	mutex_exit(&bufcache_lock);
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
buf_t *
incore(struct vnode *vp, daddr_t blkno)
{
	buf_t *bp;

	KASSERT(mutex_owned(&bufcache_lock));

	/* Search hash chain */
	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_cflags, BC_INVAL)) {
			KASSERT(bp->b_objlock == &vp->v_interlock);
			return (bp);
		}
	}

	return (NULL);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to ensure that the
 * cached blocks be of the correct size.
 */
buf_t *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	int err, preserve;
	buf_t *bp;

	mutex_enter(&bufcache_lock);
 loop:
	bp = incore(vp, blkno);
	if (bp != NULL) {
		err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
		if (err != 0) {
			if (err == EPASSTHROUGH)
				goto loop;
			mutex_exit(&bufcache_lock);
			return (NULL);
		}
		KASSERT(!cv_has_waiters(&bp->b_done));
#ifdef DIAGNOSTIC
		if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
		    bp->b_bcount < size && vp->v_type != VBLK)
			panic("getblk: block size invariant failed");
#endif
		bremfree(bp);
		preserve = 1;
	} else {
		if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
			goto loop;

		if (incore(vp, blkno) != NULL) {
			/* The block has come into memory in the meantime. */
			brelsel(bp, 0);
			goto loop;
		}

		LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
		bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
		mutex_enter(&vp->v_interlock);
		bgetvp(vp, bp);
		mutex_exit(&vp->v_interlock);
		preserve = 0;
	}
	mutex_exit(&bufcache_lock);

	/*
	 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
	 * if we re-size buffers here.
	 */
	if (ISSET(bp->b_flags, B_LOCKED)) {
		KASSERT(bp->b_bufsize >= size);
	} else {
		if (allocbuf(bp, size, preserve)) {
			mutex_enter(&bufcache_lock);
			LIST_REMOVE(bp, b_hash);
			mutex_exit(&bufcache_lock);
			brelse(bp, BC_INVAL);
			return NULL;
		}
	}
	BIO_SETPRIO(bp, BPRIO_DEFAULT);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
buf_t *
geteblk(int size)
{
	buf_t *bp;
	int error;

	mutex_enter(&bufcache_lock);
	while ((bp = getnewbuf(0, 0, 0)) == NULL)
		;

	SET(bp->b_cflags, BC_INVAL);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	mutex_exit(&bufcache_lock);
	BIO_SETPRIO(bp, BPRIO_DEFAULT);
	error = allocbuf(bp, size, 0);
	KASSERT(error == 0);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
int
allocbuf(buf_t *bp, int size, int preserve)
{
	void *addr;
	vsize_t oldsize, desired_size;
	int oldcount;
	int delta;

	desired_size = buf_roundsize(size);
	if (desired_size > MAXBSIZE)
		printf("allocbuf: buffer larger than MAXBSIZE requested");

	oldcount = bp->b_bcount;

	bp->b_bcount = size;

	oldsize = bp->b_bufsize;
	if (oldsize == desired_size) {
		/*
		 * Do not short cut the WAPBL resize, as the buffer length
		 * could still have changed and this would corrupt the
		 * tracking of the transaction length.
		 */
		goto out;
	}

	/*
	 * If we want a buffer of a different size, re-allocate the
	 * buffer's memory; copy old content only if needed.
	 */
	addr = buf_malloc(desired_size);
	if (addr == NULL)
		return ENOMEM;
	if (preserve)
		memcpy(addr, bp->b_data, MIN(oldsize,desired_size));
	if (bp->b_data != NULL)
		buf_mrelease(bp->b_data, oldsize);
	bp->b_data = addr;
	bp->b_bufsize = desired_size;

	/*
	 * Update overall buffer memory counter (protected by bufcache_lock)
	 */
	delta = (long)desired_size - (long)oldsize;

	mutex_enter(&bufcache_lock);
	if ((bufmem += delta) > bufmem_hiwater) {
		/*
		 * Need to trim overall memory usage.
		 */
		while (buf_canrelease()) {
			if (curcpu()->ci_schedstate.spc_flags &
			    SPCF_SHOULDYIELD) {
				mutex_exit(&bufcache_lock);
				preempt();
				mutex_enter(&bufcache_lock);
			}
			if (buf_trim() == 0)
				break;
		}
	}
	mutex_exit(&bufcache_lock);

 out:
	if (wapbl_vphaswapbl(bp->b_vp))
		WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);

	return 0;
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 *
 * Called with the buffer queues locked.
 * Return buffer locked.
 */
buf_t *
getnewbuf(int slpflag, int slptimeo, int from_bufq)
{
	buf_t *bp;
	struct vnode *vp;

 start:
	KASSERT(mutex_owned(&bufcache_lock));

	/*
	 * Get a new buffer from the pool.
	 */
	if (!from_bufq && buf_lotsfree()) {
		mutex_exit(&bufcache_lock);
		bp = pool_cache_get(buf_cache, PR_NOWAIT);
		if (bp != NULL) {
			memset((char *)bp, 0, sizeof(*bp));
			buf_init(bp);
			SET(bp->b_cflags, BC_BUSY);	/* mark buffer busy */
			mutex_enter(&bufcache_lock);
#if defined(DIAGNOSTIC)
			bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
			return (bp);
		}
		mutex_enter(&bufcache_lock);
	}

	KASSERT(mutex_owned(&bufcache_lock));
	if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL ||
	    (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) {
		KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
		bremfree(bp);

		/* Buffer is no longer on free lists. */
		SET(bp->b_cflags, BC_BUSY);
	} else {
		/*
		 * XXX: !from_bufq should be removed.
		 */
		if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
			/* wait for a free buffer of any kind */
			if ((slpflag & PCATCH) != 0)
				(void)cv_timedwait_sig(&needbuffer_cv,
				    &bufcache_lock, slptimeo);
			else
				(void)cv_timedwait(&needbuffer_cv,
				    &bufcache_lock, slptimeo);
		}
		return (NULL);
	}

#ifdef DIAGNOSTIC
	if (bp->b_bufsize <= 0)
		panic("buffer %p: on queue but empty", bp);
#endif

	if (ISSET(bp->b_cflags, BC_VFLUSH)) {
		/*
		 * This is a delayed write buffer being flushed to disk.  Make
		 * sure it gets aged out of the queue when it's finished, and
		 * leave it off the LRU queue.
		 */
		CLR(bp->b_cflags, BC_VFLUSH);
		SET(bp->b_cflags, BC_AGE);
		goto start;
	}

	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
	KASSERT(bp->b_refcnt > 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	/*
	 * If buffer was a delayed write, start it and return NULL
	 * (since we might sleep while starting the write).
	 */
	if (ISSET(bp->b_oflags, BO_DELWRI)) {
		/*
		 * This buffer has gone through the LRU, so make sure it gets
		 * reused ASAP.
		 */
		SET(bp->b_cflags, BC_AGE);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		return (NULL);
	}

	vp = bp->b_vp;

	/* clear out various other fields */
	bp->b_cflags = BC_BUSY;
	bp->b_oflags = 0;
	bp->b_flags = 0;
	bp->b_dev = NODEV;
	bp->b_blkno = 0;
	bp->b_lblkno = 0;
	bp->b_rawblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;

	LIST_REMOVE(bp, b_hash);

	/* Disassociate us from our vnode, if we had one... */
	if (vp != NULL) {
		mutex_enter(&vp->v_interlock);
		brelvp(bp);
		mutex_exit(&vp->v_interlock);
	}

	return (bp);
}

/*
 * Attempt to free an aged buffer off the queues.
 * Called with queue lock held.
 * Returns the amount of buffer memory freed.
 */
static int
buf_trim(void)
{
	buf_t *bp;
	long size = 0;

	KASSERT(mutex_owned(&bufcache_lock));

	/* Instruct getnewbuf() to get buffers off the queues */
	if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
		return 0;

	KASSERT((bp->b_cflags & BC_WANTED) == 0);
	size = bp->b_bufsize;
	bufmem -= size;
	if (size > 0) {
		buf_mrelease(bp->b_data, size);
		bp->b_bcount = bp->b_bufsize = 0;
	}
	/* brelse() will return the buffer to the global buffer pool */
	brelsel(bp, 0);
	return size;
}

int
buf_drain(int n)
{
	int size = 0, sz;

	KASSERT(mutex_owned(&bufcache_lock));

	while (size < n && bufmem > bufmem_lowater) {
		sz = buf_trim();
		if (sz <= 0)
			break;
		size += sz;
	}

	return size;
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(buf_t *bp)
{

	KASSERT(ISSET(bp->b_cflags, BC_BUSY));
	KASSERT(bp->b_refcnt > 0);

	mutex_enter(bp->b_objlock);
	while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI))
		cv_wait(&bp->b_done, bp->b_objlock);
	mutex_exit(bp->b_objlock);

	return bp->b_error;
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(buf_t *bp)
{
	int s;

	KASSERT(!ISSET(bp->b_oflags, BO_DONE));

	if (cpu_intr_p()) {
		/* From interrupt mode: defer to a soft interrupt. */
		s = splvm();
		TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
		softint_schedule(biodone_sih);
		splx(s);
	} else {
		/* Process now - the buffer may be freed soon. */
		biodone2(bp);
	}
}

static void
biodone2(buf_t *bp)
{
	void (*callout)(buf_t *);

	mutex_enter(bp->b_objlock);
	/* Note that the transfer is done. */
	if (ISSET(bp->b_oflags, BO_DONE))
		panic("biodone2 already");
	CLR(bp->b_flags, B_COWDONE);
	SET(bp->b_oflags, BO_DONE);
	BIO_SETPRIO(bp, BPRIO_DEFAULT);

	/* Wake up waiting writers. */
	if (!ISSET(bp->b_flags, B_READ))
		vwakeup(bp);

	if ((callout = bp->b_iodone) != NULL) {
		/* Note callout done, then call out. */
		KASSERT(!cv_has_waiters(&bp->b_done));
		KERNEL_LOCK(1, NULL);	/* XXXSMP */
		bp->b_iodone = NULL;
		mutex_exit(bp->b_objlock);
		(*callout)(bp);
		KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
	} else if (ISSET(bp->b_flags, B_ASYNC)) {
		/* If async, release. */
		KASSERT(!cv_has_waiters(&bp->b_done));
		mutex_exit(bp->b_objlock);
		brelse(bp, 0);
	} else {
		/* Otherwise just wake up waiters in biowait(). */
		cv_broadcast(&bp->b_done);
		mutex_exit(bp->b_objlock);
	}
}

static void
biointr(void *cookie)
{
	struct cpu_info *ci;
	buf_t *bp;
	int s;

	ci = curcpu();

	while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
		KASSERT(curcpu() == ci);

		s = splvm();
		bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
		TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
		splx(s);

		biodone2(bp);
	}
}

/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue(void)
{
	buf_t *bp;
	int n = 0;

	mutex_enter(&bufcache_lock);
	TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist)
		n++;
	mutex_exit(&bufcache_lock);
	return (n);
}

/*
 * Wait for all buffers to complete I/O
 * Return the number of "stuck" buffers.
 */
int
buf_syncwait(void)
{
	buf_t *bp;
	int iter, nbusy, nbusy_prev = 0, dcount, ihash;

	dcount = 10000;
	for (iter = 0; iter < 20;) {
		mutex_enter(&bufcache_lock);
		nbusy = 0;
		for (ihash = 0; ihash < bufhash+1; ihash++) {
			LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
				if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY)
					nbusy += ((bp->b_flags & B_READ) == 0);
			}
		}
		mutex_exit(&bufcache_lock);

		if (nbusy == 0)
			break;
		if (nbusy_prev == 0)
			nbusy_prev = nbusy;
		printf("%d ", nbusy);
		kpause("bflush", false, (iter == 0) ? 1 : hz / 25 * iter, NULL);
		if (nbusy >= nbusy_prev) /* we didn't flush anything */
			iter++;
		else
			nbusy_prev = nbusy;
	}

	if (nbusy) {
#if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
		printf("giving up\nPrinting vnodes for busy buffers\n");
		for (ihash = 0; ihash < bufhash+1; ihash++) {
			LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) {
				if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY &&
				    (bp->b_flags & B_READ) == 0)
					vprint(NULL, bp->b_vp);
			}
		}
#endif
	}

	return nbusy;
}

static void
sysctl_fillbuf(buf_t *i, struct buf_sysctl *o)
{

	o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
	o->b_error = i->b_error;
	o->b_prio = i->b_prio;
	o->b_dev = i->b_dev;
	o->b_bufsize = i->b_bufsize;
	o->b_bcount = i->b_bcount;
	o->b_resid = i->b_resid;
	o->b_addr = PTRTOUINT64(i->b_data);
	o->b_blkno = i->b_blkno;
	o->b_rawblkno = i->b_rawblkno;
	o->b_iodone = PTRTOUINT64(i->b_iodone);
	o->b_proc = PTRTOUINT64(i->b_proc);
	o->b_vp = PTRTOUINT64(i->b_vp);
	o->b_saveaddr = PTRTOUINT64(i->b_saveaddr);
	o->b_lblkno = i->b_lblkno;
}

#define KERN_BUFSLOP 20
static int
sysctl_dobuf(SYSCTLFN_ARGS)
{
	buf_t *bp;
	struct buf_sysctl bs;
	struct bqueue *bq;
	char *dp;
	u_int i, op, arg;
	size_t len, needed, elem_size, out_size;
	int error, elem_count, retries;

	if (namelen == 1 && name[0] == CTL_QUERY)
		return (sysctl_query(SYSCTLFN_CALL(rnode)));

	if (namelen != 4)
		return (EINVAL);

	retries = 100;
 retry:
	dp = oldp;
	len = (oldp != NULL) ? *oldlenp : 0;
	op = name[0];
	arg = name[1];
	elem_size = name[2];
	elem_count = name[3];
	out_size = MIN(sizeof(bs), elem_size);

	/*
	 * at the moment, these are just "placeholders" to make the
	 * API for retrieving kern.buf data more extensible in the
	 * future.
	 *
	 * XXX kern.buf currently has "netbsd32" issues.  hopefully
	 * these will be resolved at a later point.
	 */
	if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
	    elem_size < 1 || elem_count < 0)
		return (EINVAL);

	error = 0;
	needed = 0;
	sysctl_unlock();
	mutex_enter(&bufcache_lock);
	for (i = 0; i < BQUEUES; i++) {
		bq = &bufqueues[i];
		TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
			bq->bq_marker = bp;
			if (len >= elem_size && elem_count > 0) {
				sysctl_fillbuf(bp, &bs);
				mutex_exit(&bufcache_lock);
				error = copyout(&bs, dp, out_size);
				mutex_enter(&bufcache_lock);
				if (error)
					break;
				if (bq->bq_marker != bp) {
					/*
					 * This sysctl node is only for
					 * statistics.  Retry; if the
					 * queue keeps changing, then
					 * bail out.
					 */
					if (retries-- == 0) {
						error = EAGAIN;
						break;
					}
					mutex_exit(&bufcache_lock);
					goto retry;
				}
				dp += elem_size;
				len -= elem_size;
			}
			needed += elem_size;
			if (elem_count > 0 && elem_count != INT_MAX)
				elem_count--;
		}
		if (error != 0)
			break;
	}
	mutex_exit(&bufcache_lock);
	sysctl_relock();

	*oldlenp = needed;
	if (oldp == NULL)
		*oldlenp += KERN_BUFSLOP * sizeof(buf_t);

	return (error);
}

static int
sysctl_bufvm_update(SYSCTLFN_ARGS)
{
	int t, error, rv;
	struct sysctlnode node;

	node = *rnode;
	node.sysctl_data = &t;
	t = *(int *)rnode->sysctl_data;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return (error);

	if (t < 0)
		return EINVAL;
	if (rnode->sysctl_data == &bufcache) {
		if (t > 100)
			return (EINVAL);
		bufcache = t;
		buf_setwm();
	} else if (rnode->sysctl_data == &bufmem_lowater) {
		if (bufmem_hiwater - t < 16)
			return (EINVAL);
		bufmem_lowater = t;
	} else if (rnode->sysctl_data == &bufmem_hiwater) {
		if (t - bufmem_lowater < 16)
			return (EINVAL);
		bufmem_hiwater = t;
	} else
		return (EINVAL);

	/* Drain until below new high water mark */
	sysctl_unlock();
	mutex_enter(&bufcache_lock);
	while ((t = bufmem - bufmem_hiwater) >= 0) {
		rv = buf_drain(t / (2 * 1024));
		if (rv <= 0)
			break;
	}
	mutex_exit(&bufcache_lock);
	sysctl_relock();

	return 0;
}

static struct sysctllog *vfsbio_sysctllog;

static void
sysctl_kern_buf_setup(void)
{

	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "kern", NULL,
		       NULL, 0, NULL, 0,
		       CTL_KERN, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "buf",
		       SYSCTL_DESCR("Kernel buffer cache information"),
		       sysctl_dobuf, 0, NULL, 0,
		       CTL_KERN, KERN_BUF, CTL_EOL);
}

static void
sysctl_vm_buf_setup(void)
{

	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT,
		       CTLTYPE_NODE, "vm", NULL,
		       NULL, 0, NULL, 0,
		       CTL_VM, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "bufcache",
		       SYSCTL_DESCR("Percentage of physical memory to use for "
				    "buffer cache"),
		       sysctl_bufvm_update, 0, &bufcache, 0,
		       CTL_VM, CTL_CREATE, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READONLY,
		       CTLTYPE_INT, "bufmem",
		       SYSCTL_DESCR("Amount of kernel memory used by buffer "
				    "cache"),
		       NULL, 0, &bufmem, 0,
		       CTL_VM, CTL_CREATE, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "bufmem_lowater",
		       SYSCTL_DESCR("Minimum amount of kernel memory to "
				    "reserve for buffer cache"),
		       sysctl_bufvm_update, 0, &bufmem_lowater, 0,
		       CTL_VM, CTL_CREATE, CTL_EOL);
	sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
		       CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
		       CTLTYPE_INT, "bufmem_hiwater",
		       SYSCTL_DESCR("Maximum amount of kernel memory to use "
				    "for buffer cache"),
		       sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
		       CTL_VM, CTL_CREATE, CTL_EOL);
}

#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats(void)
{
	int i, j, count;
	buf_t *bp;
	struct bqueue *dp;
	int counts[(MAXBSIZE / PAGE_SIZE) + 1];
	static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			counts[j] = 0;
		TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
			counts[bp->b_bufsize/PAGE_SIZE]++;
			count++;
		}
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
		printf("\n");
	}
}
#endif /* DEBUG */

/* ------------------------------ */

buf_t *
getiobuf(struct vnode *vp, bool waitok)
{
	buf_t *bp;

	bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
	if (bp == NULL)
		return bp;

	buf_init(bp);

	if ((bp->b_vp = vp) == NULL)
		bp->b_objlock = &buffer_lock;
	else
		bp->b_objlock = &vp->v_interlock;

	return bp;
}

void
putiobuf(buf_t *bp)
{

	buf_destroy(bp);
	pool_cache_put(bufio_cache, bp);
}

/*
 * nestiobuf_iodone: b_iodone callback for nested buffers.
 */
void
nestiobuf_iodone(buf_t *bp)
{
	buf_t *mbp = bp->b_private;
	int error;
	int donebytes;

	KASSERT(bp->b_bcount <= bp->b_bufsize);
	KASSERT(mbp != bp);

	error = bp->b_error;
	if (bp->b_error == 0 &&
	    (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
		/*
		 * Not all got transferred, raise an error. We have no way to
		 * propagate these conditions to mbp.
		 */
		error = EIO;
	}

	donebytes = bp->b_bufsize;

	putiobuf(bp);
	nestiobuf_done(mbp, donebytes, error);
}

/*
 * nestiobuf_setup: setup a "nested" buffer.
 *
 * => 'mbp' is a "master" buffer which is being divided into sub pieces.
 * => 'bp' should be a buffer allocated by getiobuf.
 * => 'offset' is a byte offset in the master buffer.
 * => 'size' is a size in bytes of this nested buffer.
 */
void
nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
{
	const int b_read = mbp->b_flags & B_READ;
	struct vnode *vp = mbp->b_vp;

	KASSERT(mbp->b_bcount >= offset + size);
	bp->b_vp = vp;
	bp->b_dev = mbp->b_dev;
	bp->b_objlock = mbp->b_objlock;
	bp->b_cflags = BC_BUSY;
	bp->b_flags = B_ASYNC | b_read;
	bp->b_iodone = nestiobuf_iodone;
	bp->b_data = (char *)mbp->b_data + offset;
	bp->b_resid = bp->b_bcount = size;
	bp->b_bufsize = bp->b_bcount;
	bp->b_private = mbp;
	BIO_COPYPRIO(bp, mbp);
	if (!b_read && vp != NULL) {
		mutex_enter(&vp->v_interlock);
		vp->v_numoutput++;
		mutex_exit(&vp->v_interlock);
	}
}
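
/*
 * Usage sketch (annotation; the names and the split are hypothetical):
 * divide a master buffer "mbp" into two nested buffers.  Each nested
 * buffer completes via nestiobuf_iodone() -> nestiobuf_done(), and
 * biodone(mbp) fires once mbp->b_resid has drained to zero.
 *
 *	mbp->b_resid = mbp->b_bcount;
 *	buf_t *b0 = getiobuf(vp, true);
 *	buf_t *b1 = getiobuf(vp, true);
 *	nestiobuf_setup(mbp, b0, 0, chunk);
 *	nestiobuf_setup(mbp, b1, chunk, mbp->b_bcount - chunk);
 *	VOP_STRATEGY(vp, b0);
 *	VOP_STRATEGY(vp, b1);
 */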

/*
 * nestiobuf_done: propagate completion to the master buffer.
 *
 * => 'donebytes' specifies how many bytes in the 'mbp' is completed.
 * => 'error' is an errno(2) that 'donebytes' has been completed with.
 */
void
nestiobuf_done(buf_t *mbp, int donebytes, int error)
{

	if (donebytes == 0) {
		return;
	}
	mutex_enter(mbp->b_objlock);
	KASSERT(mbp->b_resid >= donebytes);
	mbp->b_resid -= donebytes;
	if (error)
		mbp->b_error = error;
	if (mbp->b_resid == 0) {
		mutex_exit(mbp->b_objlock);
		biodone(mbp);
	} else
		mutex_exit(mbp->b_objlock);
}

void
buf_init(buf_t *bp)
{

	cv_init(&bp->b_busy, "biolock");
	cv_init(&bp->b_done, "biowait");
	bp->b_dev = NODEV;
	bp->b_error = 0;
	bp->b_flags = 0;
	bp->b_cflags = 0;
	bp->b_oflags = 0;
	bp->b_objlock = &buffer_lock;
	bp->b_iodone = NULL;
	bp->b_refcnt = 1;
	bp->b_dev = NODEV;
	bp->b_vnbufs.le_next = NOLIST;
	BIO_SETPRIO(bp, BPRIO_DEFAULT);
}

void
buf_destroy(buf_t *bp)
{

	cv_destroy(&bp->b_done);
	cv_destroy(&bp->b_busy);
}

int
bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
{
	int error;

	KASSERT(mutex_owned(&bufcache_lock));

	if ((bp->b_cflags & BC_BUSY) != 0) {
		if (curlwp == uvm.pagedaemon_lwp)
			return EDEADLK;
		bp->b_cflags |= BC_WANTED;
		bref(bp);
		if (interlock != NULL)
			mutex_exit(interlock);
		if (intr) {
			error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
			    timo);
		} else {
			error = cv_timedwait(&bp->b_busy, &bufcache_lock,
			    timo);
		}
		brele(bp);
		if (interlock != NULL)
			mutex_enter(interlock);
		if (error != 0)
			return error;
		return EPASSTHROUGH;
	}
	bp->b_cflags |= BC_BUSY;

	return 0;
}