1 /* $NetBSD: vfs_bio.c,v 1.221 2009/11/11 09:15:42 rmind Exp $ */
4 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Wasabi Systems, Inc.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
69 * Copyright (c) 1994 Christopher G. Demetriou
71 * Redistribution and use in source and binary forms, with or without
72 * modification, are permitted provided that the following conditions
74 * 1. Redistributions of source code must retain the above copyright
75 * notice, this list of conditions and the following disclaimer.
76 * 2. Redistributions in binary form must reproduce the above copyright
77 * notice, this list of conditions and the following disclaimer in the
78 * documentation and/or other materials provided with the distribution.
79 * 3. All advertising materials mentioning features or use of this software
80 * must display the following acknowledgement:
81 * This product includes software developed by the University of
82 * California, Berkeley and its contributors.
83 * 4. Neither the name of the University nor the names of its contributors
84 * may be used to endorse or promote products derived from this software
85 * without specific prior written permission.
87 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
88 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
89 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
90 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
91 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
92 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
93 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
94 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
95 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
96 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
99 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
103 * The buffer cache subsystem.
106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
107 * Leffler, et al.: The Design and Implementation of the 4.3BSD
108 * UNIX Operating System (Addison-Wesley, 1989)
112 * There are three locks:
113 * - bufcache_lock: protects global buffer cache state.
114 * - BC_BUSY: a long-term per-buffer lock.
115 * - buf_t::b_objlock: lock on completion (biowait vs. biodone).
117 * For buffers associated with vnodes (the most common case) b_objlock points
118 * to the vnode_t::v_interlock.  Otherwise, it points to the generic buffer_lock.
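/*
 * A minimal sketch of the locking pattern, based on bwrite() and bdwrite()
 * below: state changes that affect both the global queues and a single
 * buffer take bufcache_lock first and then the buffer's b_objlock, e.g.
 *
 *	mutex_enter(&bufcache_lock);
 *	mutex_enter(bp->b_objlock);
 *	SET(bp->b_oflags, BO_DELWRI);
 *	reassignbuf(bp, bp->b_vp);
 *	mutex_exit(&bufcache_lock);
 *	...
 *	mutex_exit(bp->b_objlock);
 */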
125 #include <sys/cdefs.h>
126 __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.221 2009/11/11 09:15:42 rmind Exp $");
129 #include "opt_bufcache.h"
131 #include <sys/param.h>
132 #include <sys/systm.h>
133 #include <sys/kernel.h>
134 #include <sys/proc.h>
136 #include <sys/vnode.h>
137 #include <sys/mount.h>
138 #include <sys/resourcevar.h>
139 #include <sys/sysctl.h>
140 #include <sys/conf.h>
141 #include <sys/kauth.h>
142 #include <sys/fstrans.h>
143 #include <sys/intr.h>
145 #include <sys/wapbl.h>
149 #include <miscfs/specfs/specdev.h>
156 # if (BUFCACHE < 5) || (BUFCACHE > 95)
157 # error BUFCACHE is not between 5 and 95
163 u_int	nbuf;			/* desired number of buffer headers */
164 u_int	bufpages = BUFPAGES;	/* optional hardwired count */
165 u_int	bufcache = BUFCACHE;	/* max % of RAM to use for buffer cache */
167 /* Function prototypes */
170 static void buf_setwm(void);
171 static int buf_trim(void);
172 static void *bufpool_page_alloc(struct pool *, int);
173 static void bufpool_page_free(struct pool *, void *);
174 static buf_t *bio_doread(struct vnode *, daddr_t, int, kauth_cred_t, int);
176 static buf_t *getnewbuf(int, int, int);
177 static int buf_lotsfree(void);
178 static int buf_canrelease(void);
179 static u_long buf_mempoolidx(u_long);
180 static u_long buf_roundsize(u_long);
181 static void *buf_malloc(size_t);
182 static void buf_mrelease(void *, size_t);
183 static void binsheadfree(buf_t *, struct bqueue *);
184 static void binstailfree(buf_t *, struct bqueue *);
185 int count_lock_queue(void); /* XXX */
187 static int checkfreelist(buf_t *, struct bqueue *, int);
189 static void biointr(void *);
190 static void biodone2(buf_t *);
191 static void bref(buf_t *);
192 static void brele(buf_t *);
193 static void sysctl_kern_buf_setup(void);
194 static void sysctl_vm_buf_setup(void);
197 * Definitions for the buffer hash lists.
199 #define BUFHASH(dvp, lbn) \
200 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
201 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
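/*
 * A minimal lookup sketch using BUFHASH, mirroring incore() further below:
 * hash on the vnode pointer and logical block number, then walk that chain
 * looking for a matching buffer that has not been marked invalid.
 *
 *	buf_t *bp;
 *
 *	KASSERT(mutex_owned(&bufcache_lock));
 *	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
 *		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
 *		    !ISSET(bp->b_cflags, BC_INVAL))
 *			break;
 *	}
 */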
203 struct bqueue bufqueues[BQUEUES];
205 static kcondvar_t needbuffer_cv;
210 kmutex_t bufcache_lock;
211 kmutex_t buffer_lock;
213 /* Software ISR for completed transfers. */
214 static void *biodone_sih;
216 /* Buffer pool for I/O buffers. */
217 static pool_cache_t buf_cache;
218 static pool_cache_t bufio_cache;
220 /* XXX - somewhat gross.. */
221 #if MAXBSIZE == 0x2000
223 #elif MAXBSIZE == 0x4000
225 #elif MAXBSIZE == 0x8000
231 #define MEMPOOL_INDEX_OFFSET 9 /* smallest pool is 512 bytes */
232 #if (1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) != MAXBSIZE
233 #error update vfs_bio buffer memory parameters
236 /* Buffer memory pools */
237 static struct pool bmempools[NMEMPOOLS];
239 static struct vm_map *buf_map;
242 * Buffer memory pool allocator.
245 bufpool_page_alloc(struct pool *pp, int flags)
248 return (void *)uvm_km_alloc(buf_map,
250 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
255 bufpool_page_free(struct pool *pp, void *v)
258 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
261 static struct pool_allocator bufmempool_allocator = {
262 .pa_alloc = bufpool_page_alloc,
263 .pa_free = bufpool_page_free,
264 .pa_pagesz = MAXBSIZE,
267 /* Buffer memory management variables */
268 u_long bufmem_valimit;
269 u_long bufmem_hiwater;
270 u_long bufmem_lowater;
274 * MD code can call this to set a hard limit on the amount
275 * of virtual memory used by the buffer cache.
278 buf_setvalimit(vsize_t sz)
281 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
282 if (sz < NMEMPOOLS * MAXBSIZE)
293 bufmem_hiwater = buf_memcalc();
294 /* lowater is approx. 2% of memory (with bufcache = 15) */
295 #define BUFMEM_WMSHIFT 3
296 #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
297 if (bufmem_hiwater < BUFMEM_HIWMMIN)
298 /* Ensure a reasonable minimum value */
299 bufmem_hiwater = BUFMEM_HIWMMIN;
300 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
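/*
 * Worked example of the watermark arithmetic above (illustrative numbers
 * only): with the default bufcache = 15 on a machine with 1 GiB of RAM,
 * buf_memcalc() yields roughly 15% of 1 GiB, i.e. about 154 MiB, for
 * bufmem_hiwater; bufmem_lowater is then hiwater >> BUFMEM_WMSHIFT, i.e.
 * one eighth of that (about 19 MiB), which is the "approx. 2% of memory"
 * mentioned in the comment above.
 */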
304 int debug_verify_freelist = 0;
306 checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
310 if (!debug_verify_freelist)
313 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
323 * Insq/Remq for the buffer free lists.
324 * Call with buffer queue locked.
327 binsheadfree(buf_t *bp, struct bqueue *dp)
330 KASSERT(mutex_owned(&bufcache_lock));
331 KASSERT(bp->b_freelistindex == -1);
332 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
333 dp->bq_bytes += bp->b_bufsize;
334 bp->b_freelistindex = dp - bufqueues;
338 binstailfree(buf_t *bp, struct bqueue *dp)
341 KASSERT(mutex_owned(&bufcache_lock));
342 KASSERT(bp->b_freelistindex == -1);
343 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
344 dp->bq_bytes += bp->b_bufsize;
345 bp->b_freelistindex = dp - bufqueues;
352 int bqidx = bp->b_freelistindex;
354 KASSERT(mutex_owned(&bufcache_lock));
356 KASSERT(bqidx != -1);
357 dp = &bufqueues[bqidx];
358 KDASSERT(checkfreelist(bp, dp, 1));
359 KASSERT(dp->bq_bytes >= bp->b_bufsize);
360 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
361 dp->bq_bytes -= bp->b_bufsize;
363 /* For the sysctl helper. */
364 if (bp == dp->bq_marker)
365 dp->bq_marker = NULL;
367 #if defined(DIAGNOSTIC)
368 bp->b_freelistindex = -1;
369 #endif /* defined(DIAGNOSTIC) */
373 * Add a reference to a buffer structure that came from buf_cache.
379 KASSERT(mutex_owned(&bufcache_lock));
380 KASSERT(bp->b_refcnt > 0);
386 * Free an unused buffer structure that came from buf_cache.
392 KASSERT(mutex_owned(&bufcache_lock));
393 KASSERT(bp->b_refcnt > 0);
395 if (bp->b_refcnt-- == 1) {
398 memset((char *)bp, 0, sizeof(*bp));
400 pool_cache_put(buf_cache, bp);
405 * note that for some ports this is used by pmap bootstrap code to
406 * determine kva size.
414 * Determine the upper bound of memory to use for buffers.
416 * - If bufpages is specified, use that as the number
419 * - Otherwise, use bufcache as the percentage of
426 printf("forcing bufcache %d -> 5", bufcache);
430 printf("forcing bufcache %d -> 95", bufcache);
433 n = calc_cache_size(buf_map, bufcache,
434 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
439 if (bufmem_valimit != 0 && n > bufmem_valimit)
446 * Initialize buffers and hash links for buffers.
455 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
456 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
457 cv_init(&needbuffer_cv, "needbuf");
459 if (bufmem_valimit != 0) {
460 vaddr_t minaddr = 0, maxaddr;
461 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
462 bufmem_valimit, 0, false, 0);
464 panic("bufinit: cannot allocate submap");
466 buf_map = kernel_map;
469 * Initialize buffer cache memory parameters.
474 /* On "small" machines use small pool page sizes where possible */
475 use_std = (physmem < atop(16*1024*1024));
478 * Also use them on systems that can map the pool pages using
479 * a direct-mapped segment.
481 #ifdef PMAP_MAP_POOLPAGE
485 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
486 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
487 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
488 "biopl", NULL, IPL_BIO, NULL, NULL, NULL);
490 bufmempool_allocator.pa_backingmap = buf_map;
491 for (i = 0; i < NMEMPOOLS; i++) {
492 struct pool_allocator *pa;
493 struct pool *pp = &bmempools[i];
494 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
495 char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
496 if (__predict_true(size >= 1024))
497 (void)snprintf(name, 8, "buf%dk", size / 1024);
499 (void)snprintf(name, 8, "buf%db", size);
500 pa = (size <= PAGE_SIZE && use_std)
501 ? &pool_allocator_nointr
502 : &bufmempool_allocator;
503 pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE);
504 pool_setlowat(pp, 1);
505 pool_sethiwat(pp, 1);
508 /* Initialize the buffer queues */
509 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
510 TAILQ_INIT(&dp->bq_queue);
515 * Estimate hash table size based on the amount of memory we
516 * intend to use for the buffer cache. The average buffer
517 * size is dependent on our clients (i.e. filesystems).
519 * For now, use an empirical 3K per buffer.
521 nbuf = (bufmem_hiwater / 1024) / 3;
522 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
524 sysctl_kern_buf_setup();
525 sysctl_vm_buf_setup();
532 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
534 if (biodone_sih == NULL)
535 panic("bufinit2: can't establish soft interrupt");
543 /* Always allocate if less than the low water mark. */
544 if (bufmem < bufmem_lowater)
547 /* Never allocate if greater than the high water mark. */
548 if (bufmem > bufmem_hiwater)
551 /* If there's anything on the AGE list, it should be eaten. */
552 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
556 * The probability of getting a new allocation is inversely
557 * proportional to the current size of the cache, using
558 * a granularity of 16 steps.
560 try = random() & 0x0000000fL;
562 /* Don't use "16 * bufmem" here to avoid a 32-bit overflow. */
563 thresh = (bufmem - bufmem_lowater) /
564 ((bufmem_hiwater - bufmem_lowater) / 16);
569 /* Otherwise don't allocate. */
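/*
 * Illustrative numbers for the probabilistic test above, assuming the
 * (elided) comparison admits an allocation when try >= thresh: if bufmem
 * sits three quarters of the way between bufmem_lowater and
 * bufmem_hiwater, thresh evaluates to 12, so only 4 of the 16 possible
 * values of try pass and a new allocation is admitted with probability
 * 1/4; just above the low water mark nearly every attempt succeeds.
 */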
574 * Return an estimate of the number of bytes we think need to be
575 * released to help resolve low memory conditions.
577 * => called with bufcache_lock held.
582 int pagedemand, ninvalid = 0;
584 KASSERT(mutex_owned(&bufcache_lock));
586 if (bufmem < bufmem_lowater)
589 if (bufmem > bufmem_hiwater)
590 return bufmem - bufmem_hiwater;
592 ninvalid += bufqueues[BQ_AGE].bq_bytes;
594 pagedemand = uvmexp.freetarg - uvmexp.free;
597 return MAX(ninvalid, MIN(2 * MAXBSIZE,
598 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
602 * Buffer memory allocation helper functions
605 buf_mempoolidx(u_long size)
610 size >>= MEMPOOL_INDEX_OFFSET;
616 panic("buf mem pool index %d", n);
621 buf_roundsize(u_long size)
623 /* Round up to nearest power of 2 */
624 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
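/*
 * Example of the size helpers above (with MEMPOOL_INDEX_OFFSET = 9, so the
 * smallest pool holds 512-byte buffers): a request for 3000 bytes maps to
 * pool index 3, and buf_roundsize(3000) therefore returns
 * 1 << (3 + 9) = 4096, the next power of two large enough to hold it.
 */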
628 buf_malloc(size_t size)
630 u_int n = buf_mempoolidx(size);
634 addr = pool_get(&bmempools[n], PR_NOWAIT);
638 /* No memory, see if we can free some. If so, try again */
639 mutex_enter(&bufcache_lock);
640 if (buf_drain(1) > 0) {
641 mutex_exit(&bufcache_lock);
645 if (curlwp == uvm.pagedaemon_lwp) {
646 mutex_exit(&bufcache_lock);
650 /* Wait for buffers to arrive on the LRU queue */
651 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
652 mutex_exit(&bufcache_lock);
659 buf_mrelease(void *addr, size_t size)
662 pool_put(&bmempools[buf_mempoolidx(size)], addr);
666 * bread()/breadn() helper.
669 bio_doread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
675 bp = getblk(vp, blkno, size, 0, 0);
679 panic("bio_doread: no such buf");
684 * If the buffer does not have valid data, start a read.
685 * Note that if the buffer is BC_INVAL, getblk() won't return it.
686 * Therefore, it's valid if its I/O has completed or been delayed.
688 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
689 /* Start I/O for the buffer. */
690 SET(bp->b_flags, B_READ | async);
692 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
694 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
695 VOP_STRATEGY(vp, bp);
697 /* Pay for the read. */
698 curlwp->l_ru.ru_inblock++;
702 if (vp->v_type == VBLK)
703 mp = vp->v_specmountpoint;
708 * Collect statistics on synchronous and asynchronous reads.
709 * Reads from block devices are charged to their associated
710 * filesystem (if any).
714 mp->mnt_stat.f_syncreads++;
716 mp->mnt_stat.f_asyncreads++;
724 * This algorithm is described in Bach (p. 54).
727 bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
728 int flags, buf_t **bpp)
733 /* Get buffer for block. */
734 bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
736 /* Wait for the read to complete, and return result. */
738 if (error == 0 && (flags & B_MODIFY) != 0)
739 error = fscow_run(bp, true);
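/*
 * A minimal, hypothetical caller of bread(); "vp", "lbn" and "bsize" stand
 * in for whatever the file system supplies, and only the success path is
 * shown:
 *
 *	buf_t *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, 0, &bp);
 *	if (error == 0) {
 *		... examine bp->b_data ...
 *		brelse(bp, 0);
 *	}
 */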
745 * Read-ahead multiple disk blocks. The first is sync, the rest async.
746 * Trivial modification to the breada algorithm presented in Bach (p. 55).
749 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
750 int *rasizes, int nrablks, kauth_cred_t cred, int flags, buf_t **bpp)
755 bp = *bpp = bio_doread(vp, blkno, size, cred, 0);
758 * For each of the read-ahead blocks, start a read, if necessary.
760 mutex_enter(&bufcache_lock);
761 for (i = 0; i < nrablks; i++) {
762 /* If it's in the cache, just go on to the next one. */
763 if (incore(vp, rablks[i]))
766 /* Get a buffer for the read-ahead block */
767 mutex_exit(&bufcache_lock);
768 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
769 mutex_enter(&bufcache_lock);
771 mutex_exit(&bufcache_lock);
773 /* Otherwise, we had to start a read for it; wait until it's valid. */
775 if (error == 0 && (flags & B_MODIFY) != 0)
776 error = fscow_run(bp, true);
781 * Block write. Described in Bach (p.56)
786 int rv, sync, wasdelayed;
790 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
791 KASSERT(!cv_has_waiters(&bp->b_done));
795 KASSERT(bp->b_objlock == &vp->v_interlock);
796 if (vp->v_type == VBLK)
797 mp = vp->v_specmountpoint;
804 if (mp && mp->mnt_wapbl) {
805 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
812 * Remember buffer type, to switch on it later. If the write was
813 * synchronous, but the file system was mounted with MNT_ASYNC,
814 * convert it to a delayed write.
815 * XXX note that this relies on delayed tape writes being converted
816 * to async, not sync writes (which is safe, but ugly).
818 sync = !ISSET(bp->b_flags, B_ASYNC);
819 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
825 * Collect statistics on synchronous and asynchronous writes.
826 * Writes to block devices are charged to their associated
827 * filesystem (if any).
831 mp->mnt_stat.f_syncwrites++;
833 mp->mnt_stat.f_asyncwrites++;
837 * Pay for the I/O operation and make sure the buf is on the correct
841 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
842 CLR(bp->b_flags, B_READ);
844 mutex_enter(&bufcache_lock);
845 mutex_enter(bp->b_objlock);
846 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
847 reassignbuf(bp, bp->b_vp);
848 mutex_exit(&bufcache_lock);
850 curlwp->l_ru.ru_oublock++;
851 mutex_enter(bp->b_objlock);
852 CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
856 mutex_exit(bp->b_objlock);
858 /* Initiate disk write. */
860 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
862 BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
864 VOP_STRATEGY(vp, bp);
867 /* If I/O was synchronous, wait for it to complete. */
870 /* Release the buffer. */
882 struct vop_bwrite_args *ap = v;
884 return (bwrite(ap->a_bp));
890 * The buffer is marked dirty, but is not queued for I/O.
891 * This routine should be used when the buffer is expected
892 * to be modified again soon, typically a small write that
893 * partially fills a buffer.
895 * NB: magnetic tapes cannot be delayed; they must be
896 * written in the order that the writes are requested.
898 * Described in Leffler, et al. (pp. 208-213).
904 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
905 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
906 KASSERT(ISSET(bp->b_cflags, BC_BUSY));
907 KASSERT(!cv_has_waiters(&bp->b_done));
909 /* If this is a tape block, write the block now. */
910 if (bdev_type(bp->b_dev) == D_TAPE) {
915 if (wapbl_vphaswapbl(bp->b_vp)) {
916 struct mount *mp = wapbl_vptomp(bp->b_vp);
918 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
919 WAPBL_ADD_BUF(mp, bp);
924 * If the block hasn't been seen before:
925 * (1) Mark it as having been seen,
926 * (2) Charge for the write,
927 * (3) Make sure it's on its vnode's correct block list.
929 KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock);
931 if (!ISSET(bp->b_oflags, BO_DELWRI)) {
932 mutex_enter(&bufcache_lock);
933 mutex_enter(bp->b_objlock);
934 SET(bp->b_oflags, BO_DELWRI);
935 curlwp->l_ru.ru_oublock++;
936 reassignbuf(bp, bp->b_vp);
937 mutex_exit(&bufcache_lock);
939 mutex_enter(bp->b_objlock);
941 /* Otherwise, the "write" is done, so mark and release the buffer. */
942 CLR(bp->b_oflags, BO_DONE);
943 mutex_exit(bp->b_objlock);
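/*
 * A rough guide to choosing among the write paths above (illustrative,
 * not an exhaustive statement of policy): a file system typically picks
 * one of
 *
 *	bwrite(bp);	synchronous: start the I/O and biowait() for it
 *	bawrite(bp);	asynchronous: start the I/O, released on completion
 *	bdwrite(bp);	delayed: mark BO_DELWRI and keep the data in the cache
 *
 * depending on whether the data must reach the disk now, soon, or may be
 * batched with later modifications to the same block.
 */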
949 * Asynchronous block write; just an asynchronous bwrite().
955 KASSERT(ISSET(bp
->b_cflags
, BC_BUSY
));
957 SET(bp
->b_flags
, B_ASYNC
);
962 * Release a buffer on to the free lists.
963 * Described in Bach (p. 46).
966 brelsel(buf_t
*bp
, int set
)
971 KASSERT(mutex_owned(&bufcache_lock
));
972 KASSERT(!cv_has_waiters(&bp
->b_done
));
973 KASSERT(bp
->b_refcnt
> 0);
975 SET(bp
->b_cflags
, set
);
977 KASSERT(ISSET(bp
->b_cflags
, BC_BUSY
));
978 KASSERT(bp
->b_iodone
== NULL
);
980 /* Wake up any processes waiting for any buffer to become free. */
981 cv_signal(&needbuffer_cv);
983 /* Wake up any processes waiting for _this_ buffer to become free. */
984 if (ISSET(bp->b_cflags, BC_WANTED))
985 CLR(bp->b_cflags, BC_WANTED|BC_AGE);
988 * Determine which queue the buffer should be on, then put it there.
991 /* If it's locked, don't report an error; try again later. */
992 if (ISSET(bp
->b_flags
, B_LOCKED
))
995 /* If it's not cacheable, or an error, mark it invalid. */
996 if (ISSET(bp
->b_cflags
, BC_NOCACHE
) || bp
->b_error
!= 0)
997 SET(bp
->b_cflags
, BC_INVAL
);
999 if (ISSET(bp
->b_cflags
, BC_VFLUSH
)) {
1001 * This is a delayed write buffer that was just flushed to
1002 * disk. It is still on the LRU queue. If it's become
1003 * invalid, then we need to move it to a different queue;
1004 * otherwise leave it in its current position.
1006 CLR(bp
->b_cflags
, BC_VFLUSH
);
1007 if (!ISSET(bp
->b_cflags
, BC_INVAL
|BC_AGE
) &&
1008 !ISSET(bp
->b_flags
, B_LOCKED
) && bp
->b_error
== 0) {
1009 KDASSERT(checkfreelist(bp
, &bufqueues
[BQ_LRU
], 1));
1010 goto already_queued
;
1016 KDASSERT(checkfreelist(bp
, &bufqueues
[BQ_AGE
], 0));
1017 KDASSERT(checkfreelist(bp
, &bufqueues
[BQ_LRU
], 0));
1018 KDASSERT(checkfreelist(bp
, &bufqueues
[BQ_LOCKED
], 0));
1020 if ((bp
->b_bufsize
<= 0) || ISSET(bp
->b_cflags
, BC_INVAL
)) {
1022 * If it's invalid or empty, dissociate it from its vnode
1023 * and put on the head of the appropriate queue.
1025 if (ISSET(bp
->b_flags
, B_LOCKED
)) {
1026 if (wapbl_vphaswapbl(vp
= bp
->b_vp
)) {
1027 struct mount
*mp
= wapbl_vptomp(vp
);
1029 KASSERT(bp
->b_iodone
1030 != mp
->mnt_wapbl_op
->wo_wapbl_biodone
);
1031 WAPBL_REMOVE_BUF(mp
, bp
);
1035 mutex_enter(bp
->b_objlock
);
1036 CLR(bp
->b_oflags
, BO_DONE
|BO_DELWRI
);
1037 if ((vp
= bp
->b_vp
) != NULL
) {
1038 KASSERT(bp
->b_objlock
== &vp
->v_interlock
);
1039 reassignbuf(bp
, bp
->b_vp
);
1041 mutex_exit(&vp
->v_interlock
);
1043 KASSERT(bp
->b_objlock
== &buffer_lock
);
1044 mutex_exit(bp
->b_objlock
);
1047 if (bp
->b_bufsize
<= 0)
1049 goto already_queued
;
1052 bufq
= &bufqueues
[BQ_AGE
];
1053 binsheadfree(bp
, bufq
);
1056 * It has valid data. Put it on the end of the appropriate
1057 * queue, so that it'll stick around for as long as possible.
1058 * If the buf is AGE, but has dependencies, we must put it on the last
1059 * bufqueue to be scanned, i.e. LRU. This protects against the
1060 * livelock where BQ_AGE only has buffers with dependencies,
1061 * and we thus never get to the dependent buffers in BQ_LRU.
1063 if (ISSET(bp
->b_flags
, B_LOCKED
)) {
1064 /* locked in core */
1065 bufq
= &bufqueues
[BQ_LOCKED
];
1066 } else if (!ISSET(bp
->b_cflags
, BC_AGE
)) {
1068 bufq
= &bufqueues
[BQ_LRU
];
1070 /* stale but valid data */
1071 bufq
= &bufqueues
[BQ_AGE
];
1073 binstailfree(bp
, bufq
);
1076 /* Unlock the buffer. */
1077 CLR(bp
->b_cflags
, BC_AGE
|BC_BUSY
|BC_NOCACHE
);
1078 CLR(bp
->b_flags
, B_ASYNC
);
1079 cv_broadcast(&bp
->b_busy
);
1081 if (bp
->b_bufsize
<= 0)
1086 brelse(buf_t
*bp
, int set
)
1089 mutex_enter(&bufcache_lock
);
1091 mutex_exit(&bufcache_lock
);
1095 * Determine if a block is in the cache.
1096 * Just look on what would be its hash chain. If it's there, return
1097 * a pointer to it, unless it's marked invalid. If it's marked invalid,
1098 * we normally don't return the buffer, unless the caller explicitly
1102 incore(struct vnode *vp, daddr_t blkno)
1106 KASSERT(mutex_owned(&bufcache_lock));
1108 /* Search hash chain */
1109 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
1110 if (bp->b_lblkno == blkno && bp->b_vp == vp &&
1111 !ISSET(bp->b_cflags, BC_INVAL)) {
1112 KASSERT(bp->b_objlock == &vp->v_interlock);
1121 * Get a block of requested size that is associated with
1122 * a given vnode and block offset. If it is found in the
1123 * block cache, mark it as having been found, make it busy
1124 * and return it. Otherwise, return an empty block of the
1125 * correct size. It is up to the caller to ensure that the
1126 * cached blocks are of the correct size.
1129 getblk(struct vnode
*vp
, daddr_t blkno
, int size
, int slpflag
, int slptimeo
)
1134 mutex_enter(&bufcache_lock
);
1136 bp
= incore(vp
, blkno
);
1138 err
= bbusy(bp
, ((slpflag
& PCATCH
) != 0), slptimeo
, NULL
);
1140 if (err
== EPASSTHROUGH
)
1142 mutex_exit(&bufcache_lock
);
1145 KASSERT(!cv_has_waiters(&bp
->b_done
));
1147 if (ISSET(bp
->b_oflags
, BO_DONE
|BO_DELWRI
) &&
1148 bp
->b_bcount
< size
&& vp
->v_type
!= VBLK
)
1149 panic("getblk: block size invariant failed");
1154 if ((bp
= getnewbuf(slpflag
, slptimeo
, 0)) == NULL
)
1157 if (incore(vp
, blkno
) != NULL
) {
1158 /* The block has come into memory in the meantime. */
1163 LIST_INSERT_HEAD(BUFHASH(vp
, blkno
), bp
, b_hash
);
1164 bp
->b_blkno
= bp
->b_lblkno
= bp
->b_rawblkno
= blkno
;
1165 mutex_enter(&vp
->v_interlock
);
1167 mutex_exit(&vp
->v_interlock
);
1170 mutex_exit(&bufcache_lock
);
1173 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
1174 * if we re-size buffers here.
1176 if (ISSET(bp
->b_flags
, B_LOCKED
)) {
1177 KASSERT(bp
->b_bufsize
>= size
);
1179 if (allocbuf(bp
, size
, preserve
)) {
1180 mutex_enter(&bufcache_lock
);
1181 LIST_REMOVE(bp
, b_hash
);
1182 mutex_exit(&bufcache_lock
);
1183 brelse(bp
, BC_INVAL
);
1187 BIO_SETPRIO(bp
, BPRIO_DEFAULT
);
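/*
 * A minimal, hypothetical use of getblk() for a block the caller intends
 * to overwrite completely, so no read from disk is needed ("vp", "lbn"
 * and "bsize" are placeholders for what the file system supplies):
 *
 *	buf_t *bp;
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	memset(bp->b_data, 0, bsize);
 *	... fill in bp->b_data ...
 *	bdwrite(bp);	or bwrite(bp) to push it out immediately
 */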
1192 * Get an empty, disassociated buffer of given size.
1200 mutex_enter(&bufcache_lock);
1201 while ((bp = getnewbuf(0, 0, 0)) == NULL)
1204 SET(bp->b_cflags, BC_INVAL);
1205 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1206 mutex_exit(&bufcache_lock);
1207 BIO_SETPRIO(bp, BPRIO_DEFAULT);
1208 error = allocbuf(bp, size, 0);
1209 KASSERT(error == 0);
1214 * Expand or contract the actual memory allocated to a buffer.
1216 * If the buffer shrinks, data is lost, so it's up to the
1217 * caller to have written it out *first*; this routine will not
1218 * start a write. If the buffer grows, it's the caller's
1219 * responsibility to fill out the buffer's additional contents.
1222 allocbuf(buf_t
*bp
, int size
, int preserve
)
1225 vsize_t oldsize
, desired_size
;
1229 desired_size
= buf_roundsize(size
);
1230 if (desired_size
> MAXBSIZE
)
1231 printf("allocbuf: buffer larger than MAXBSIZE requested");
1233 oldcount
= bp
->b_bcount
;
1235 bp
->b_bcount
= size
;
1237 oldsize
= bp
->b_bufsize
;
1238 if (oldsize
== desired_size
) {
1240 * Do not short cut the WAPBL resize, as the buffer length
1241 * could still have changed and this would corrupt the
1242 * tracking of the transaction length.
1248 * If we want a buffer of a different size, re-allocate the
1249 * buffer's memory; copy old content only if needed.
1251 addr
= buf_malloc(desired_size
);
1255 memcpy(addr
, bp
->b_data
, MIN(oldsize
,desired_size
));
1256 if (bp
->b_data
!= NULL
)
1257 buf_mrelease(bp
->b_data
, oldsize
);
1259 bp
->b_bufsize
= desired_size
;
1262 * Update overall buffer memory counter (protected by bufcache_lock)
1264 delta
= (long)desired_size
- (long)oldsize
;
1266 mutex_enter(&bufcache_lock
);
1267 if ((bufmem
+= delta
) > bufmem_hiwater
) {
1269 * Need to trim overall memory usage.
1271 while (buf_canrelease()) {
1272 if (curcpu()->ci_schedstate
.spc_flags
&
1274 mutex_exit(&bufcache_lock
);
1276 mutex_enter(&bufcache_lock
);
1278 if (buf_trim() == 0)
1282 mutex_exit(&bufcache_lock
);
1285 if (wapbl_vphaswapbl(bp
->b_vp
))
1286 WAPBL_RESIZE_BUF(wapbl_vptomp(bp
->b_vp
), bp
, oldsize
, oldcount
);
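/*
 * Sketch of how a caller might grow a cached block with allocbuf() before
 * appending data to it (a hypothetical fragment; "osize" and "nsize" are
 * the old and new block sizes):
 *
 *	bp = getblk(vp, lbn, osize, 0, 0);
 *	allocbuf(bp, nsize, 1);		preserve = 1 keeps the old contents
 *	memset((char *)bp->b_data + osize, 0, nsize - osize);
 *	... fill in the new bytes ...
 *	bdwrite(bp);
 */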
1292 * Find a buffer which is available for use.
1293 * Select something from a free list.
1294 * Preference is to AGE list, then LRU list.
1296 * Called with the buffer queues locked.
1297 * Return buffer locked.
1300 getnewbuf(int slpflag
, int slptimeo
, int from_bufq
)
1306 KASSERT(mutex_owned(&bufcache_lock
));
1309 * Get a new buffer from the pool.
1311 if (!from_bufq
&& buf_lotsfree()) {
1312 mutex_exit(&bufcache_lock
);
1313 bp
= pool_cache_get(buf_cache
, PR_NOWAIT
);
1315 memset((char *)bp
, 0, sizeof(*bp
));
1317 SET(bp
->b_cflags
, BC_BUSY
); /* mark buffer busy */
1318 mutex_enter(&bufcache_lock
);
1319 #if defined(DIAGNOSTIC)
1320 bp
->b_freelistindex
= -1;
1321 #endif /* defined(DIAGNOSTIC) */
1324 mutex_enter(&bufcache_lock
);
1327 KASSERT(mutex_owned(&bufcache_lock
));
1328 if ((bp
= TAILQ_FIRST(&bufqueues
[BQ_AGE
].bq_queue
)) != NULL
||
1329 (bp
= TAILQ_FIRST(&bufqueues
[BQ_LRU
].bq_queue
)) != NULL
) {
1330 KASSERT(!ISSET(bp
->b_cflags
, BC_BUSY
) || ISSET(bp
->b_cflags
, BC_VFLUSH
));
1333 /* Buffer is no longer on free lists. */
1334 SET(bp
->b_cflags
, BC_BUSY
);
1337 * XXX: !from_bufq should be removed.
1339 if (!from_bufq
|| curlwp
!= uvm
.pagedaemon_lwp
) {
1340 /* wait for a free buffer of any kind */
1341 if ((slpflag
& PCATCH
) != 0)
1342 (void)cv_timedwait_sig(&needbuffer_cv
,
1343 &bufcache_lock
, slptimeo
);
1345 (void)cv_timedwait(&needbuffer_cv
,
1346 &bufcache_lock
, slptimeo
);
1352 if (bp
->b_bufsize
<= 0)
1353 panic("buffer %p: on queue but empty", bp
);
1356 if (ISSET(bp
->b_cflags
, BC_VFLUSH
)) {
1358 * This is a delayed write buffer being flushed to disk. Make
1359 * sure it gets aged out of the queue when it's finished, and
1360 * leave it off the LRU queue.
1362 CLR(bp
->b_cflags
, BC_VFLUSH
);
1363 SET(bp
->b_cflags
, BC_AGE
);
1367 KASSERT(ISSET(bp
->b_cflags
, BC_BUSY
));
1368 KASSERT(bp
->b_refcnt
> 0);
1369 KASSERT(!cv_has_waiters(&bp
->b_done
));
1372 * If buffer was a delayed write, start it and return NULL
1373 * (since we might sleep while starting the write).
1375 if (ISSET(bp
->b_oflags
, BO_DELWRI
)) {
1377 * This buffer has gone through the LRU, so make sure it gets
1380 SET(bp
->b_cflags
, BC_AGE
);
1381 mutex_exit(&bufcache_lock
);
1383 mutex_enter(&bufcache_lock
);
1389 /* clear out various other fields */
1390 bp
->b_cflags
= BC_BUSY
;
1402 LIST_REMOVE(bp
, b_hash
);
1404 /* Disassociate us from our vnode, if we had one... */
1406 mutex_enter(&vp
->v_interlock
);
1408 mutex_exit(&vp
->v_interlock
);
1415 * Attempt to free an aged buffer off the queues.
1416 * Called with queue lock held.
1417 * Returns the amount of buffer memory freed.
1425 KASSERT(mutex_owned(&bufcache_lock
));
1427 /* Instruct getnewbuf() to get buffers off the queues */
1428 if ((bp
= getnewbuf(PCATCH
, 1, 1)) == NULL
)
1431 KASSERT((bp
->b_cflags
& BC_WANTED
) == 0);
1432 size
= bp
->b_bufsize
;
1435 buf_mrelease(bp
->b_data
, size
);
1436 bp
->b_bcount
= bp
->b_bufsize
= 0;
1438 /* brelse() will return the buffer to the global buffer pool */
1448 KASSERT(mutex_owned(&bufcache_lock
));
1450 while (size
< n
&& bufmem
> bufmem_lowater
) {
1461 * Wait for operations on the buffer to complete.
1462 * When they do, extract and return the I/O's error value.
1468 KASSERT(ISSET(bp
->b_cflags
, BC_BUSY
));
1469 KASSERT(bp
->b_refcnt
> 0);
1471 mutex_enter(bp
->b_objlock
);
1472 while (!ISSET(bp
->b_oflags
, BO_DONE
| BO_DELWRI
))
1473 cv_wait(&bp
->b_done
, bp
->b_objlock
);
1474 mutex_exit(bp
->b_objlock
);
1480 * Mark I/O complete on a buffer.
1482 * If a callback has been requested, e.g. the pageout
1483 * daemon, do so. Otherwise, awaken waiting processes.
1485 * [ Leffler, et al., say on p. 247:
1486 * "This routine wakes up the blocked process, frees the buffer
1487 * for an asynchronous write, or, for a request by the pagedaemon
1488 * process, invokes a procedure specified in the buffer structure" ]
1490 * In real life, the pagedaemon (or other system processes) wants
1491 * to do async stuff too, and doesn't want the buffer brelse()'d.
1492 * (for swap pager, that puts swap buffers on the free lists (!!!),
1493 * for the vn device, that puts malloc'd buffers on the free lists!)
1500 KASSERT(!ISSET(bp
->b_oflags
, BO_DONE
));
1503 /* From interrupt mode: defer to a soft interrupt. */
1505 TAILQ_INSERT_TAIL(&curcpu()->ci_data
.cpu_biodone
, bp
, b_actq
);
1506 softint_schedule(biodone_sih
);
1509 /* Process now - the buffer may be freed soon. */
1517 void (*callout
)(buf_t
*);
1519 mutex_enter(bp
->b_objlock
);
1520 /* Note that the transfer is done. */
1521 if (ISSET(bp
->b_oflags
, BO_DONE
))
1522 panic("biodone2 already");
1523 CLR(bp
->b_flags
, B_COWDONE
);
1524 SET(bp
->b_oflags
, BO_DONE
);
1525 BIO_SETPRIO(bp
, BPRIO_DEFAULT
);
1527 /* Wake up waiting writers. */
1528 if (!ISSET(bp
->b_flags
, B_READ
))
1531 if ((callout
= bp
->b_iodone
) != NULL
) {
1532 /* Note callout done, then call out. */
1533 KASSERT(!cv_has_waiters(&bp
->b_done
));
1534 KERNEL_LOCK(1, NULL
); /* XXXSMP */
1535 bp
->b_iodone
= NULL
;
1536 mutex_exit(bp
->b_objlock
);
1538 KERNEL_UNLOCK_ONE(NULL
); /* XXXSMP */
1539 } else if (ISSET(bp
->b_flags
, B_ASYNC
)) {
1540 /* If async, release. */
1541 KASSERT(!cv_has_waiters(&bp
->b_done
));
1542 mutex_exit(bp
->b_objlock
);
1545 /* Otherwise just wake up waiters in biowait(). */
1546 cv_broadcast(&bp
->b_done
);
1547 mutex_exit(bp
->b_objlock
);
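/*
 * Sketch of the asynchronous completion pattern that biodone()/biodone2()
 * support, mirroring how nestiobuf_setup() below uses b_iodone (the
 * callback name is hypothetical): the issuer installs a callback before
 * starting the I/O, and biodone2() invokes it once the transfer is done.
 *
 *	bp->b_iodone = my_iodone;	called with bp on completion
 *	bp->b_flags |= B_ASYNC;
 *	VOP_STRATEGY(vp, bp);
 */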
1552 biointr(void *cookie
)
1554 struct cpu_info
*ci
;
1560 while (!TAILQ_EMPTY(&ci
->ci_data
.cpu_biodone
)) {
1561 KASSERT(curcpu() == ci
);
1564 bp
= TAILQ_FIRST(&ci
->ci_data
.cpu_biodone
);
1565 TAILQ_REMOVE(&ci
->ci_data
.cpu_biodone
, bp
, b_actq
);
1573 * Return a count of buffers on the "locked" queue.
1576 count_lock_queue(void)
1581 mutex_enter(&bufcache_lock
);
1582 TAILQ_FOREACH(bp
, &bufqueues
[BQ_LOCKED
].bq_queue
, b_freelist
)
1584 mutex_exit(&bufcache_lock
);
1589 * Wait for all buffers to complete I/O
1590 * Return the number of "stuck" buffers.
1596 int iter
, nbusy
, nbusy_prev
= 0, dcount
, ihash
;
1599 for (iter
= 0; iter
< 20;) {
1600 mutex_enter(&bufcache_lock
);
1602 for (ihash
= 0; ihash
< bufhash
+1; ihash
++) {
1603 LIST_FOREACH(bp
, &bufhashtbl
[ihash
], b_hash
) {
1604 if ((bp
->b_cflags
& (BC_BUSY
|BC_INVAL
)) == BC_BUSY
)
1605 nbusy
+= ((bp
->b_flags
& B_READ
) == 0);
1608 mutex_exit(&bufcache_lock
);
1612 if (nbusy_prev
== 0)
1614 printf("%d ", nbusy
);
1615 kpause("bflush", false, (iter
== 0) ? 1 : hz
/ 25 * iter
, NULL
);
1616 if (nbusy
>= nbusy_prev
) /* we didn't flush anything */
1623 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
1624 printf("giving up\nPrinting vnodes for busy buffers\n");
1625 for (ihash
= 0; ihash
< bufhash
+1; ihash
++) {
1626 LIST_FOREACH(bp
, &bufhashtbl
[ihash
], b_hash
) {
1627 if ((bp
->b_cflags
& (BC_BUSY
|BC_INVAL
)) == BC_BUSY
&&
1628 (bp
->b_flags
& B_READ
) == 0)
1629 vprint(NULL
, bp
->b_vp
);
1639 sysctl_fillbuf(buf_t
*i
, struct buf_sysctl
*o
)
1642 o
->b_flags
= i
->b_flags
| i
->b_cflags
| i
->b_oflags
;
1643 o
->b_error
= i
->b_error
;
1644 o
->b_prio
= i
->b_prio
;
1645 o
->b_dev
= i
->b_dev
;
1646 o
->b_bufsize
= i
->b_bufsize
;
1647 o
->b_bcount
= i
->b_bcount
;
1648 o
->b_resid
= i
->b_resid
;
1649 o
->b_addr
= PTRTOUINT64(i
->b_data
);
1650 o
->b_blkno
= i
->b_blkno
;
1651 o
->b_rawblkno
= i
->b_rawblkno
;
1652 o
->b_iodone
= PTRTOUINT64(i
->b_iodone
);
1653 o
->b_proc
= PTRTOUINT64(i
->b_proc
);
1654 o
->b_vp
= PTRTOUINT64(i
->b_vp
);
1655 o
->b_saveaddr
= PTRTOUINT64(i
->b_saveaddr
);
1656 o
->b_lblkno
= i
->b_lblkno
;
1659 #define KERN_BUFSLOP 20
1661 sysctl_dobuf(SYSCTLFN_ARGS
)
1664 struct buf_sysctl bs
;
1668 size_t len
, needed
, elem_size
, out_size
;
1669 int error
, elem_count
, retries
;
1671 if (namelen
== 1 && name
[0] == CTL_QUERY
)
1672 return (sysctl_query(SYSCTLFN_CALL(rnode
)));
1680 len
= (oldp
!= NULL
) ? *oldlenp
: 0;
1683 elem_size
= name
[2];
1684 elem_count
= name
[3];
1685 out_size
= MIN(sizeof(bs
), elem_size
);
1688 * at the moment, these are just "placeholders" to make the
1689 * API for retrieving kern.buf data more extensible in the future.
1692 * XXX kern.buf currently has "netbsd32" issues. hopefully
1693 * these will be resolved at a later point.
1695 if (op
!= KERN_BUF_ALL
|| arg
!= KERN_BUF_ALL
||
1696 elem_size
< 1 || elem_count
< 0)
1702 mutex_enter(&bufcache_lock
);
1703 for (i
= 0; i
< BQUEUES
; i
++) {
1705 TAILQ_FOREACH(bp
, &bq
->bq_queue
, b_freelist
) {
1707 if (len
>= elem_size
&& elem_count
> 0) {
1708 sysctl_fillbuf(bp
, &bs
);
1709 mutex_exit(&bufcache_lock
);
1710 error
= copyout(&bs
, dp
, out_size
);
1711 mutex_enter(&bufcache_lock
);
1714 if (bq
->bq_marker
!= bp
) {
1716 * This sysctl node is only for
1717 * statistics. Retry; if the
1718 * queue keeps changing, then
1721 if (retries
-- == 0) {
1725 mutex_exit(&bufcache_lock
);
1731 needed
+= elem_size
;
1732 if (elem_count
> 0 && elem_count
!= INT_MAX
)
1738 mutex_exit(&bufcache_lock
);
1743 *oldlenp
+= KERN_BUFSLOP
* sizeof(buf_t
);
1749 sysctl_bufvm_update(SYSCTLFN_ARGS
)
1752 struct sysctlnode node
;
1755 node
.sysctl_data
= &t
;
1756 t
= *(int *)rnode
->sysctl_data
;
1757 error
= sysctl_lookup(SYSCTLFN_CALL(&node
));
1758 if (error
|| newp
== NULL
)
1763 if (rnode
->sysctl_data
== &bufcache
) {
1768 } else if (rnode
->sysctl_data
== &bufmem_lowater
) {
1769 if (bufmem_hiwater
- t
< 16)
1772 } else if (rnode
->sysctl_data
== &bufmem_hiwater
) {
1773 if (t
- bufmem_lowater
< 16)
1779 /* Drain until below new high water mark */
1781 mutex_enter(&bufcache_lock
);
1782 while ((t
= bufmem
- bufmem_hiwater
) >= 0) {
1783 rv
= buf_drain(t
/ (2 * 1024));
1787 mutex_exit(&bufcache_lock
);
1793 static struct sysctllog
*vfsbio_sysctllog
;
1796 sysctl_kern_buf_setup(void)
1799 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1801 CTLTYPE_NODE
, "kern", NULL
,
1804 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1806 CTLTYPE_NODE
, "buf",
1807 SYSCTL_DESCR("Kernel buffer cache information"),
1808 sysctl_dobuf
, 0, NULL
, 0,
1809 CTL_KERN
, KERN_BUF
, CTL_EOL
);
1813 sysctl_vm_buf_setup(void)
1816 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1818 CTLTYPE_NODE
, "vm", NULL
,
1821 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1822 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
1823 CTLTYPE_INT
, "bufcache",
1824 SYSCTL_DESCR("Percentage of physical memory to use for "
1826 sysctl_bufvm_update
, 0, &bufcache
, 0,
1827 CTL_VM
, CTL_CREATE
, CTL_EOL
);
1828 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1829 CTLFLAG_PERMANENT
|CTLFLAG_READONLY
,
1830 CTLTYPE_INT
, "bufmem",
1831 SYSCTL_DESCR("Amount of kernel memory used by buffer "
1833 NULL
, 0, &bufmem
, 0,
1834 CTL_VM
, CTL_CREATE
, CTL_EOL
);
1835 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1836 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
1837 CTLTYPE_INT
, "bufmem_lowater",
1838 SYSCTL_DESCR("Minimum amount of kernel memory to "
1839 "reserve for buffer cache"),
1840 sysctl_bufvm_update
, 0, &bufmem_lowater
, 0,
1841 CTL_VM
, CTL_CREATE
, CTL_EOL
);
1842 sysctl_createv(&vfsbio_sysctllog
, 0, NULL
, NULL
,
1843 CTLFLAG_PERMANENT
|CTLFLAG_READWRITE
,
1844 CTLTYPE_INT
, "bufmem_hiwater",
1845 SYSCTL_DESCR("Maximum amount of kernel memory to use "
1846 "for buffer cache"),
1847 sysctl_bufvm_update
, 0, &bufmem_hiwater
, 0,
1848 CTL_VM
, CTL_CREATE
, CTL_EOL
);
1853 * Print out statistics on the current allocation of the buffer pool.
1854 * Can be enabled to print out on every ``sync'' by setting "syncprt"
1855 * in vfs_syscalls.c using sysctl.
1863 int counts
[(MAXBSIZE
/ PAGE_SIZE
) + 1];
1864 static const char *bname
[BQUEUES
] = { "LOCKED", "LRU", "AGE" };
1866 for (dp
= bufqueues
, i
= 0; dp
< &bufqueues
[BQUEUES
]; dp
++, i
++) {
1868 for (j
= 0; j
<= MAXBSIZE
/PAGE_SIZE
; j
++)
1870 TAILQ_FOREACH(bp
, &dp
->bq_queue
, b_freelist
) {
1871 counts
[bp
->b_bufsize
/PAGE_SIZE
]++;
1874 printf("%s: total-%d", bname
[i
], count
);
1875 for (j
= 0; j
<= MAXBSIZE
/PAGE_SIZE
; j
++)
1877 printf(", %d-%d", j
* PAGE_SIZE
, counts
[j
]);
1883 /* ------------------------------ */
1886 getiobuf(struct vnode
*vp
, bool waitok
)
1890 bp
= pool_cache_get(bufio_cache
, (waitok
? PR_WAITOK
: PR_NOWAIT
));
1896 if ((bp
->b_vp
= vp
) == NULL
)
1897 bp
->b_objlock
= &buffer_lock
;
1899 bp
->b_objlock
= &vp
->v_interlock
;
1909 pool_cache_put(bufio_cache
, bp
);
1913 * nestiobuf_iodone: b_iodone callback for nested buffers.
1917 nestiobuf_iodone(buf_t *bp)
1919 buf_t *mbp = bp->b_private;
1923 KASSERT(bp->b_bcount <= bp->b_bufsize);
1926 error = bp->b_error;
1927 if (bp->b_error == 0 &&
1928 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
1930 * Not all of it was transferred; raise an error. We have no way to
1931 * propagate these conditions to mbp.
1936 donebytes = bp->b_bufsize;
1939 nestiobuf_done(mbp, donebytes, error);
1943 * nestiobuf_setup: set up a "nested" buffer.
1945 * => 'mbp' is a "master" buffer which is being divided into sub-pieces.
1946 * => 'bp' should be a buffer allocated by getiobuf.
1947 * => 'offset' is a byte offset in the master buffer.
1948 * => 'size' is the size in bytes of this nested buffer.
1952 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
1954 const int b_read = mbp->b_flags & B_READ;
1955 struct vnode *vp = mbp->b_vp;
1957 KASSERT(mbp->b_bcount >= offset + size);
1959 bp->b_dev = mbp->b_dev;
1960 bp->b_objlock = mbp->b_objlock;
1961 bp->b_cflags = BC_BUSY;
1962 bp->b_flags = B_ASYNC | b_read;
1963 bp->b_iodone = nestiobuf_iodone;
1964 bp->b_data = (char *)mbp->b_data + offset;
1965 bp->b_resid = bp->b_bcount = size;
1966 bp->b_bufsize = bp->b_bcount;
1967 bp->b_private = mbp;
1968 BIO_COPYPRIO(bp, mbp);
1969 if (!b_read && vp != NULL) {
1970 mutex_enter(&vp->v_interlock);
1972 mutex_exit(&vp->v_interlock);
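/*
 * A hypothetical driver splitting one master buffer across two underlying
 * devices (e.g. a simple concatenation) with the helpers above; the names
 * vp1/vp2 and the 50/50 split are illustrative only, and mbp->b_resid is
 * assumed to start out equal to mbp->b_bcount:
 *
 *	buf_t *nbp1 = getiobuf(NULL, true);
 *	buf_t *nbp2 = getiobuf(NULL, true);
 *	size_t half = mbp->b_bcount / 2;
 *
 *	nestiobuf_setup(mbp, nbp1, 0, half);
 *	nestiobuf_setup(mbp, nbp2, half, mbp->b_bcount - half);
 *	... set each nested buffer's b_blkno for its device ...
 *	VOP_STRATEGY(vp1, nbp1);
 *	VOP_STRATEGY(vp2, nbp2);
 *
 * Each nested buffer runs nestiobuf_iodone() on completion, which calls
 * nestiobuf_done(); once the master's b_resid drops to zero the master
 * buffer itself is completed.
 */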
1977 * nestiobuf_done: propagate completion to the master buffer.
1979 * => 'donebytes' specifies how many bytes of 'mbp' have completed.
1980 * => 'error' is the errno(2) value with which 'donebytes' completed.
1984 nestiobuf_done(buf_t
*mbp
, int donebytes
, int error
)
1987 if (donebytes
== 0) {
1990 mutex_enter(mbp
->b_objlock
);
1991 KASSERT(mbp
->b_resid
>= donebytes
);
1992 mbp
->b_resid
-= donebytes
;
1994 mbp
->b_error
= error
;
1995 if (mbp
->b_resid
== 0) {
1996 mutex_exit(mbp
->b_objlock
);
1999 mutex_exit(mbp
->b_objlock
);
2006 cv_init(&bp
->b_busy
, "biolock");
2007 cv_init(&bp
->b_done
, "biowait");
2013 bp
->b_objlock
= &buffer_lock
;
2014 bp
->b_iodone
= NULL
;
2017 bp
->b_vnbufs
.le_next
= NOLIST
;
2018 BIO_SETPRIO(bp
, BPRIO_DEFAULT
);
2022 buf_destroy(buf_t
*bp
)
2025 cv_destroy(&bp
->b_done
);
2026 cv_destroy(&bp
->b_busy
);
2030 bbusy(buf_t
*bp
, bool intr
, int timo
, kmutex_t
*interlock
)
2034 KASSERT(mutex_owned(&bufcache_lock
));
2036 if ((bp
->b_cflags
& BC_BUSY
) != 0) {
2037 if (curlwp
== uvm
.pagedaemon_lwp
)
2039 bp
->b_cflags
|= BC_WANTED
;
2041 if (interlock
!= NULL
)
2042 mutex_exit(interlock
);
2044 error
= cv_timedwait_sig(&bp
->b_busy
, &bufcache_lock
,
2047 error
= cv_timedwait(&bp
->b_busy
, &bufcache_lock
,
2051 if (interlock
!= NULL
)
2052 mutex_enter(interlock
);
2055 return EPASSTHROUGH
;
2057 bp
->b_cflags
|= BC_BUSY
;